лет назад: 12 · 67a384dfbb
--- a/configure.ac
+++ b/configure.ac
@@ -344,11 +344,6 @@ if test x$enable_cpu = xyes; then
 
				 	AC_DEFINE(STARPU_USE_CPU, [1], [CPU driver is activated])
			
 
				 fi
			
 
				 
			
 
				-# How many parallel worker can we support ?
			
 
				-nmaxcombinedworkers=`expr 2 \* $maxcpus`
			
 
				-AC_DEFINE_UNQUOTED(STARPU_NMAX_COMBINEDWORKERS,
			
 
				-	[$nmaxcombinedworkers], [Maximum number of worker combinations])
			
 
				-
			
 
				 ###############################################################################
			
 
				 #                                                                             #
			
 
				 #                                 CUDA settings                               #
			
@@ -969,7 +964,7 @@ AC_DEFINE_UNQUOTED(STARPU_MAXMICDEVS, [$nmaxmicdev],
 
				 AC_MSG_CHECKING(maximum number of MIC threads)
			
 
				 AC_ARG_ENABLE(maxmicthreads, [AS_HELP_STRING([--enable-maxmicthreads=<number>],
			
 
				 			[maximum number of MIC threads])],
			
 
				-			nmaxmicthreads=$enableval, nmaxmicthreads=128)
			
 
				+			nmaxmicthreads=$enableval, nmaxmicthreads=940)
			
 
				 AC_MSG_RESULT($nmaxmicthread)
			
 
				 
			
 
				 AC_DEFINE_UNQUOTED(STARPU_MAXMICCORES, [$nmaxmicthreads],
			
@@ -998,7 +993,6 @@ AC_DEFUN([STARPU_CHECK_COI_RUNTIME],
 
				     __coi_dir=$1
			
 
				     __coi_include_dir=$2
			
 
				     __coi_lib_dir=$3
			
 
				-    __coi_lib_name=$4
			
 
				 
			
 
				     if test "$__coi_dir" != "no" -a "$__coi_dir" != "" ; then
			
 
				 	AC_MSG_CHECKING(whether MIC's COI runtime is available in $__coi_dir)
			
@@ -1027,14 +1021,14 @@ AC_DEFUN([STARPU_CHECK_COI_RUNTIME],
 
				     AC_CHECK_HEADER([source/COIEngine_source.h],[have_valid_coi=yes],[have_valid_coi=no])
			
 
				 
			
 
				     if test "$have_valid_coi" = "yes" ; then
			
 
				-	AC_HAVE_LIBRARY([$__coi_lib_name],[have_valid_coi=yes],[have_valid_coi=no])
			
 
				+	AC_HAVE_LIBRARY([$4],[have_valid_coi=yes],[have_valid_coi=no])
			
 
				 
			
 
				         if test "$have_valid_coi" = "no" ; then
			
 
				             if test "$3" = "no" -a "$__coi_dir" != "no" ; then
			
 
				 		# ${__coi_dir}/lib didn't work, let's try with lib64
			
 
				                 __coi_lib_dir="$__coi_dir/lib64"
			
 
				 		LDFLAGS="${SAVED_LDFLAGS} -L$__coi_lib_dir"
			
 
				-	        AC_HAVE_LIBRARY([$__coi_lib_name],[have_valid_coi=yes],[have_valid_coi=no])
			
 
				+	        AC_HAVE_LIBRARY([$4],[have_valid_coi=yes],[have_valid_coi=no])
			
 
				             fi
			
 
				         fi
			
 
				     fi
			
@@ -1044,7 +1038,7 @@ AC_DEFUN([STARPU_CHECK_COI_RUNTIME],
 
				     fi
			
 
				 
			
 
				     if test "$have_valid_coi" = "yes" -a "$__coi_lib_dir" != "no"; then
			
 
				-        STARPU_COI_LDFLAGS="-L$__coi_lib_dir -l$__coi_lib_name"
			
 
				+        STARPU_COI_LDFLAGS="-L$__coi_lib_dir -l$4"
			
 
				     fi
			
 
				 
			
 
				     CPPFLAGS="${SAVED_CPPFLAGS}"
			
@@ -1053,12 +1047,12 @@ AC_DEFUN([STARPU_CHECK_COI_RUNTIME],
 
				 
			
 
				 if test x$enable_mic = xyes ; then
			
 
				 
			
 
				-    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, "coi_host")
			
 
				+    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, coi_host)
			
 
				 
			
 
				     # Host runtime is not compatible, we are probably cross-compiling
			
 
				     # Let's have a look for the device runtime which lib has a different name
			
 
				     if test "$have_valid_coi" = "no" ; then
			
 
				-	    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, "coi_device")
			
 
				+	    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, coi_device)
			
 
				     fi
			
 
				 
			
 
				     if test "$have_valid_coi" = "no" ; then
			
@@ -1478,6 +1472,15 @@ AC_MSG_CHECKING(Maximum number of workers)
 
				 AC_MSG_RESULT($nmaxworkers)
			
 
				 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])
			
 
				 
			
 
				+# Computes the maximun number of combined worker
			
 
				+nmaxcombinedworkers=`expr $maxcpus + $nmaxmicthreads`  
			
 
				+AC_MSG_CHECKING(Maximum number of workers combinations)
			
 
				+AC_MSG_RESULT($nmaxcombinedworkers)
			
 
				+AC_DEFINE_UNQUOTED(STARPU_NMAX_COMBINEDWORKERS,
			
 
				+	[$nmaxcombinedworkers], [Maximum number of worker combinations])
			
 
				+
			
 
				+
			
 
				+
			
 
				 # Computes the maximum number of implementations per arch
			
 
				 AC_MSG_CHECKING(maximum number of implementations)
			
 
				 AC_ARG_ENABLE(maximplementations, [AS_HELP_STRING([--enable-maximplementations=<number>],
			
--- a/include/starpu_perfmodel.h
+++ b/include/starpu_perfmodel.h
@@ -38,7 +38,7 @@ enum starpu_perfmodel_archtype
 
				 	STARPU_CUDA_DEFAULT = STARPU_MAXCPUS,
			
 
				 	STARPU_OPENCL_DEFAULT = STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS,
			
 
				 	STARPU_MIC_DEFAULT = STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS,
			
 
				-	STARPU_SCC_DEFAULT = STARPU_MIC_DEFAULT + STARPU_MAXMICDEVS
			
 
				+	STARPU_SCC_DEFAULT = STARPU_MIC_DEFAULT + STARPU_MAXMICDEVS //* STARPU_MAXMICCPUS
			
 
				 };
			
 
				 
			
 
				 #ifdef __STDC_VERSION__
			
--- a/mic-configure
+++ b/mic-configure
@@ -1,6 +1,6 @@
 
				 #!/bin/bash
			
 
				 
			
 
				-ROOT_DIR=$PWD
			
 
				+ROOT_DIR=$(dirname $0)
			
 
				 
			
 
				 cat > ./mic-config.log << EOF
			
 
				 This file was created by StarPU mic-configure
			
@@ -28,12 +28,23 @@ do
 
				 
			
 
				 done
			
 
				 
			
 
				+x=$(type -t ${mic_host}-gcc)
			
 
				+if [ -z "$x" ]
			
 
				+then
			
 
				+    echo "[error] please add path to ${mic_host}-gcc in your PATH"
			
 
				+    exit 1
			
 
				+fi
			
 
				+
			
 
				 for arch in mic host
			
 
				 do
			
 
				 	# We call the configure script from a build directory further in the
			
 
				 	# arborescence
			
 
				 
			
 
				-	command="${ROOT_DIR}/configure"
			
 
				+	case $ROOT_DIR in
			
 
				+		/*) command="${ROOT_DIR}/configure";;
			
 
				+		*) command="../${ROOT_DIR}/configure";;
			
 
				+	esac
			
 
				+
			
 
				 	params="--enable-mic --with-coi-dir=$coi_dir --prefix=$prefix/$arch"
			
 
				 
			
 
				 	if test x$arch = xmic ; then
			
@@ -44,7 +55,7 @@ do
 
				 	fi
			
 
				 
			
 
				 	# If the build directory doesn't exist yet, create it
			
 
				-	if [ ! -d "${ROOT_DIR}/build_${arch}" ] ; then
			
 
				+	if [ ! -d "build_${arch}" ] ; then
			
 
				 		mkdir "build_${arch}"
			
 
				 	fi
			
 
				 
			
@@ -59,7 +70,7 @@ do
 
				 	then
			
 
				 		exit $?
			
 
				 	fi
			
 
				-	cd "${ROOT_DIR}"
			
 
				+	cd ..
			
 
				 done
			
 
				 
			
 
				 cat > Makefile << EOF
			
--- a/mpi/examples/Makefile.am
+++ b/mpi/examples/Makefile.am
@@ -44,16 +44,13 @@ BUILT_SOURCES =
 
				 
			
 
				 CLEANFILES = *.gcno *.gcda *.linkinfo
			
 
				 
			
 
				-EXTRA_DIST = 				\
			
 
				+EXTRA_DIST = 					\
			
 
				 	mpi_lu/mpi_lu-float.h		\
			
 
				 	mpi_lu/mpi_lu-double.h		\
			
 
				 	mpi_lu/plu_example.c		\
			
 
				-	mpi_lu/plu_implicit_example.c	\
			
 
				-	mpi_lu/plu_outofcore_example.c	\
			
 
				 	mpi_lu/plu_solve.c		\
			
 
				 	mpi_lu/pxlu.h			\
			
 
				 	mpi_lu/pxlu.c			\
			
 
				-	mpi_lu/pxlu_implicit.c		\
			
 
				 	mpi_lu/pxlu_kernels.h		\
			
 
				 	mpi_lu/pxlu_kernels.c		\
			
 
				 	matrix_decomposition/mpi_cholesky_codelets.h 	\
			
@@ -104,11 +101,7 @@ if !NO_BLAS_LIB
 
				 
			
 
				 examplebin_PROGRAMS += 			\
			
 
				 	mpi_lu/plu_example_float	\
			
 
				-	mpi_lu/plu_example_double	\
			
 
				-	mpi_lu/plu_implicit_example_float	\
			
 
				-	mpi_lu/plu_implicit_example_double	\
			
 
				-	mpi_lu/plu_outofcore_example_float	\
			
 
				-	mpi_lu/plu_outofcore_example_double
			
 
				+	mpi_lu/plu_example_double
			
 
				 
			
 
				 mpi_lu_plu_example_float_LDADD =	\
			
 
				 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
			
@@ -133,57 +126,8 @@ mpi_lu_plu_example_double_SOURCES =	\
 
				 	mpi_lu/pdlu_kernels.c	    	\
			
 
				 	mpi_lu/pdlu.c		    	\
			
 
				 	$(top_srcdir)/examples/common/blas.c
			
 
				-
			
 
				-mpi_lu_plu_implicit_example_float_LDADD =	\
			
 
				-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
			
 
				-	$(STARPU_LIBNUMA_LDFLAGS)				\
			
 
				-	$(STARPU_BLAS_LDFLAGS) -lm
			
 
				-
			
 
				-mpi_lu_plu_implicit_example_float_SOURCES =	\
			
 
				-	mpi_lu/plu_implicit_example_float.c	\
			
 
				-	mpi_lu/plu_solve_float.c		\
			
 
				-	mpi_lu/pslu_kernels.c			\
			
 
				-	mpi_lu/pslu_implicit.c			\
			
 
				-	$(top_srcdir)/examples/common/blas.c
			
 
				-
			
 
				-mpi_lu_plu_implicit_example_double_LDADD =	\
			
 
				-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
			
 
				-	$(STARPU_LIBNUMA_LDFLAGS)				\
			
 
				-	$(STARPU_BLAS_LDFLAGS) -lm
			
 
				-
			
 
				-mpi_lu_plu_implicit_example_double_SOURCES =	\
			
 
				-	mpi_lu/plu_outofcore_example_double.c	\
			
 
				-	mpi_lu/plu_solve_double.c		\
			
 
				-	mpi_lu/pdlu_kernels.c			\
			
 
				-	mpi_lu/pdlu_implicit.c			\
			
 
				-	$(top_srcdir)/examples/common/blas.c
			
 
				-
			
 
				-mpi_lu_plu_outofcore_example_float_LDADD =	\
			
 
				-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
			
 
				-	$(STARPU_LIBNUMA_LDFLAGS)				\
			
 
				-	$(STARPU_BLAS_LDFLAGS) -lm
			
 
				-
			
 
				-mpi_lu_plu_outofcore_example_float_SOURCES =	\
			
 
				-	mpi_lu/plu_outofcore_example_float.c	\
			
 
				-	mpi_lu/plu_solve_float.c		\
			
 
				-	mpi_lu/pslu_kernels.c			\
			
 
				-	mpi_lu/pslu_implicit.c			\
			
 
				-	$(top_srcdir)/examples/common/blas.c
			
 
				-
			
 
				-mpi_lu_plu_outofcore_example_double_LDADD =	\
			
 
				-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
			
 
				-	$(STARPU_LIBNUMA_LDFLAGS)				\
			
 
				-	$(STARPU_BLAS_LDFLAGS) -lm
			
 
				-
			
 
				-mpi_lu_plu_outofcore_example_double_SOURCES =	\
			
 
				-	mpi_lu/plu_outofcore_example_double.c	\
			
 
				-	mpi_lu/plu_solve_double.c		\
			
 
				-	mpi_lu/pdlu_kernels.c			\
			
 
				-	mpi_lu/pdlu_implicit.c			\
			
 
				-	$(top_srcdir)/examples/common/blas.c
			
 
				 endif
			
 
				 
			
 
				-
			
 
				 ########################
			
 
				 # MPI Cholesky example #
			
 
				 ########################
			
--- a/mpi/examples/mpi_lu/pdlu_implicit.c
+++ b/mpi/examples/mpi_lu/pdlu_implicit.c
@@ -1,19 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010, 2013  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include "mpi_lu-double.h"
			
 
				-#include "pxlu_implicit.c"
			
--- a/mpi/examples/mpi_lu/plu_example.c
+++ b/mpi/examples/mpi_lu/plu_example.c
@@ -109,12 +109,6 @@ static void parse_args(int rank, int argc, char **argv)
 
				 			char *argptr;
			
 
				 			q = strtol(argv[++i], &argptr, 10);
			
 
				 		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0) {
			
 
				-			fprintf(stderr,"usage: %s [-size n] [-nblocks b] [-check] [-display] [-numa] [-p p] [-q q]\n", argv[0]);
			
 
				-			fprintf(stderr,"\np * q must be equal to the number of MPI nodes\n");
			
 
				-			exit(0);
			
 
				-		}
			
 
				 	}
			
 
				 }
			
 
				 
			
--- a/mpi/examples/mpi_lu/plu_implicit_example.c
+++ b/mpi/examples/mpi_lu/plu_implicit_example.c
@@ -1,357 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include <stdlib.h>
			
 
				-#include <stdio.h>
			
 
				-#include <string.h>
			
 
				-#include <time.h>
			
 
				-#include <math.h>
			
 
				-#include <starpu.h>
			
 
				-
			
 
				-#include "pxlu.h"
			
 
				-//#include "pxlu_kernels.h"
			
 
				-
			
 
				-#ifdef STARPU_HAVE_LIBNUMA
			
 
				-#include <numaif.h>
			
 
				-#endif
			
 
				-
			
 
				-static unsigned long size = 4096;
			
 
				-static unsigned nblocks = 16;
			
 
				-static unsigned check = 0;
			
 
				-static int p = 1;
			
 
				-static int q = 1;
			
 
				-static unsigned display = 0;
			
 
				-
			
 
				-#ifdef STARPU_HAVE_LIBNUMA
			
 
				-static unsigned numa = 0;
			
 
				-#endif
			
 
				-
			
 
				-static size_t allocated_memory = 0;
			
 
				-static size_t allocated_memory_extra = 0;
			
 
				-
			
 
				-static starpu_data_handle_t *dataA_handles;
			
 
				-static TYPE **dataA;
			
 
				-
			
 
				-int get_block_rank(unsigned i, unsigned j);
			
 
				-
			
 
				-static void parse_args(int argc, char **argv)
			
 
				-{
			
 
				-	int i;
			
 
				-	for (i = 1; i < argc; i++) {
			
 
				-		if (strcmp(argv[i], "-size") == 0) {
			
 
				-			char *argptr;
			
 
				-			size = strtol(argv[++i], &argptr, 10);
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-nblocks") == 0) {
			
 
				-			char *argptr;
			
 
				-			nblocks = strtol(argv[++i], &argptr, 10);
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-check") == 0) {
			
 
				-			check = 1;
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-display") == 0) {
			
 
				-			display = 1;
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-numa") == 0) {
			
 
				-#ifdef STARPU_HAVE_LIBNUMA
			
 
				-			numa = 1;
			
 
				-#else
			
 
				-			if (rank == 0)
			
 
				-				fprintf(stderr, "Warning: libnuma is not available\n");
			
 
				-#endif
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-p") == 0) {
			
 
				-			char *argptr;
			
 
				-			p = strtol(argv[++i], &argptr, 10);
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-q") == 0) {
			
 
				-			char *argptr;
			
 
				-			q = strtol(argv[++i], &argptr, 10);
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0) {
			
 
				-			fprintf(stderr,"usage: %s [-size n] [-nblocks b] [-check] [-display] [-numa] [-p p] [-q q]\n", argv[0]);
			
 
				-			fprintf(stderr,"\np * q must be equal to the number of MPI nodes\n");
			
 
				-			exit(0);
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-unsigned STARPU_PLU(display_flag)(void)
			
 
				-{
			
 
				-	return display;
			
 
				-}
			
 
				-
			
 
				-static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnblocks)
			
 
				-{
			
 
				-	const unsigned block_size = (psize/pnblocks);
			
 
				-
			
 
				-	unsigned i, j;
			
 
				-	for (i = 0; i < block_size; i++)
			
 
				-	     for (j = 0; j < block_size; j++)
			
 
				-	     {
			
 
				-		  blockptr[j+i*block_size] = (TYPE)starpu_drand48();
			
 
				-	     }
			
 
				-}
			
 
				-
			
 
				-static void init_matrix(int rank)
			
 
				-{
			
 
				-#ifdef STARPU_HAVE_LIBNUMA
			
 
				-	if (numa)
			
 
				-	{
			
 
				-		fprintf(stderr, "Using INTERLEAVE policy\n");
			
 
				-		unsigned long nodemask = ((1<<0)|(1<<1));
			
 
				-		int ret = set_mempolicy(MPOL_INTERLEAVE, &nodemask, 3);
			
 
				-		if (ret)
			
 
				-			perror("set_mempolicy failed");
			
 
				-	}
			
 
				-#endif
			
 
				-
			
 
				-	/* Allocate a grid of data handles, not all of them have to be allocated later on */
			
 
				-	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
			
 
				-	dataA = calloc(nblocks*nblocks, sizeof(TYPE *));
			
 
				-	allocated_memory_extra += nblocks*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
			
 
				-
			
 
				-	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
			
 
				-
			
 
				-	/* Allocate all the blocks that belong to this mpi node */
			
 
				-	unsigned long i,j;
			
 
				-	for (j = 0; j < nblocks; j++)
			
 
				-	{
			
 
				-		for (i = 0; i < nblocks; i++)
			
 
				-		{
			
 
				-			int block_rank = get_block_rank(i, j);
			
 
				-			TYPE **blockptr = &dataA[j+i*nblocks];
			
 
				-//			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
			
 
				-			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
			
 
				-
			
 
				-			if (block_rank == rank)
			
 
				-			{
			
 
				-				/* This blocks should be treated by the current MPI process */
			
 
				-				/* Allocate and fill it */
			
 
				-				starpu_malloc((void **)blockptr, blocksize);
			
 
				-				allocated_memory += blocksize;
			
 
				-
			
 
				-				//fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j);
			
 
				-				fill_block_with_random(*blockptr, size, nblocks);
			
 
				-				//fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j);
			
 
				-				if (i == j)
			
 
				-				{
			
 
				-					unsigned tmp;
			
 
				-					for (tmp = 0; tmp < size/nblocks; tmp++)
			
 
				-					{
			
 
				-						(*blockptr)[tmp*((size/nblocks)+1)] += (TYPE)10*nblocks;
			
 
				-					}
			
 
				-				}
			
 
				-
			
 
				-				/* Register it to StarPU */
			
 
				-				starpu_matrix_data_register(handleptr, STARPU_MAIN_RAM,
			
 
				-					(uintptr_t)*blockptr, size/nblocks,
			
 
				-					size/nblocks, size/nblocks, sizeof(TYPE));
			
 
				-			}
			
 
				-			else {
			
 
				-				starpu_matrix_data_register(handleptr, -1,
			
 
				-					0, size/nblocks,
			
 
				-					size/nblocks, size/nblocks, sizeof(TYPE));
			
 
				-				*blockptr = STARPU_POISON_PTR;
			
 
				-			}
			
 
				-			starpu_data_set_rank(*handleptr, block_rank);
			
 
				-			starpu_data_set_tag(*handleptr, j+i*nblocks);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	//display_all_blocks(nblocks, size/nblocks);
			
 
				-}
			
 
				-
			
 
				-TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j)
			
 
				-{
			
 
				-	return dataA[j+i*nblocks];
			
 
				-}
			
 
				-
			
 
				-int get_block_rank(unsigned i, unsigned j)
			
 
				-{
			
 
				-	/* Take a 2D block cyclic distribution */
			
 
				-	/* NB: p (resp. q) is for "direction" i (resp. j) */
			
 
				-	return (j % q) * p + (i % p);
			
 
				-}
			
 
				-
			
 
				-starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
			
 
				-{
			
 
				-	return dataA_handles[j+i*nblocks];
			
 
				-}
			
 
				-
			
 
				-static void display_grid(int rank, unsigned pnblocks)
			
 
				-{
			
 
				-	if (!display)
			
 
				-		return;
			
 
				-
			
 
				-	//if (rank == 0)
			
 
				-	{
			
 
				-		fprintf(stderr, "2D grid layout (Rank %d): \n", rank);
			
 
				-
			
 
				-		unsigned i, j;
			
 
				-		for (j = 0; j < pnblocks; j++)
			
 
				-		{
			
 
				-			for (i = 0; i < pnblocks; i++)
			
 
				-			{
			
 
				-				TYPE *blockptr = STARPU_PLU(get_block)(i, j);
			
 
				-				starpu_data_handle_t handle = STARPU_PLU(get_block_handle)(i, j);
			
 
				-
			
 
				-				fprintf(stderr, "%d (data %p handle %p)", get_block_rank(i, j), blockptr, handle);
			
 
				-			}
			
 
				-			fprintf(stderr, "\n");
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-int main(int argc, char **argv)
			
 
				-{
			
 
				-	int rank;
			
 
				-	int world_size;
			
 
				-
			
 
				-	starpu_srand48((long int)time(NULL));
			
 
				-
			
 
				-	parse_args(argc, argv);
			
 
				-
			
 
				-	int ret = starpu_init(NULL);
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-
			
 
				-	ret = starpu_mpi_init(&argc, &argv, 1);
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				-
			
 
				-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
			
 
				-
			
 
				-	STARPU_ASSERT(p*q == world_size);
			
 
				-
			
 
				-	starpu_cublas_init();
			
 
				-
			
 
				-	/*
			
 
				-	 * 	Problem Init
			
 
				-	 */
			
 
				-
			
 
				-	init_matrix(rank);
			
 
				-
			
 
				-	fprintf(stderr, "Rank %d: allocated (%d + %d) MB = %d MB\n", rank,
			
 
				-                        (int)(allocated_memory/(1024*1024)),
			
 
				-			(int)(allocated_memory_extra/(1024*1024)),
			
 
				-                        (int)((allocated_memory+allocated_memory_extra)/(1024*1024)));
			
 
				-
			
 
				-	display_grid(rank, nblocks);
			
 
				-
			
 
				-	TYPE *a_r = NULL;
			
 
				-//	STARPU_PLU(display_data_content)(a_r, size);
			
 
				-
			
 
				-	TYPE *x, *y;
			
 
				-
			
 
				-	if (check)
			
 
				-	{
			
 
				-		x = calloc(size, sizeof(TYPE));
			
 
				-		STARPU_ASSERT(x);
			
 
				-
			
 
				-		y = calloc(size, sizeof(TYPE));
			
 
				-		STARPU_ASSERT(y);
			
 
				-
			
 
				-		if (rank == 0)
			
 
				-		{
			
 
				-			unsigned ind;
			
 
				-			for (ind = 0; ind < size; ind++)
			
 
				-				x[ind] = (TYPE)starpu_drand48();
			
 
				-		}
			
 
				-
			
 
				-		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
			
 
				-
			
 
				-		if (rank == 0)
			
 
				-			STARPU_PLU(display_data_content)(a_r, size);
			
 
				-
			
 
				-//		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
			
 
				-	}
			
 
				-
			
 
				-	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);
			
 
				-
			
 
				-	/*
			
 
				-	 * 	Report performance
			
 
				-	 */
			
 
				-
			
 
				-	if (rank == 0)
			
 
				-	{
			
 
				-		fprintf(stderr, "Computation took: %f ms\n", timing/1000);
			
 
				-
			
 
				-		unsigned n = size;
			
 
				-		double flop = (2.0f*n*n*n)/3.0f;
			
 
				-		fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-	 *	Test Result Correctness
			
 
				-	 */
			
 
				-
			
 
				-	if (check)
			
 
				-	{
			
 
				-		/*
			
 
				-		 *	Compute || A - LU ||
			
 
				-		 */
			
 
				-
			
 
				-		STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r);
			
 
				-
			
 
				-#if 0
			
 
				-		/*
			
 
				-		 *	Compute || Ax - LUx ||
			
 
				-		 */
			
 
				-
			
 
				-		unsigned ind;
			
 
				-
			
 
				-		y2 = calloc(size, sizeof(TYPE));
			
 
				-		STARPU_ASSERT(y);
			
 
				-
			
 
				-		if (rank == 0)
			
 
				-		{
			
 
				-			for (ind = 0; ind < size; ind++)
			
 
				-			{
			
 
				-				y2[ind] = (TYPE)0.0;
			
 
				-			}
			
 
				-		}
			
 
				-
			
 
				-		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);
			
 
				-
			
 
				-		/* Compute y2 = y2 - y */
			
 
				-		CPU_AXPY(size, -1.0, y, 1, y2, 1);
			
 
				-
			
 
				-		TYPE err = CPU_ASUM(size, y2, 1);
			
 
				-		int max = CPU_IAMAX(size, y2, 1);
			
 
				-
			
 
				-		fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
			
 
				-		fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
			
 
				-#endif
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-	 * 	Termination
			
 
				-	 */
			
 
				-
			
 
				-	starpu_cublas_shutdown();
			
 
				-	starpu_mpi_shutdown();
			
 
				-	starpu_shutdown();
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
--- a/mpi/examples/mpi_lu/plu_implicit_example_double.c
+++ b/mpi/examples/mpi_lu/plu_implicit_example_double.c
@@ -1,19 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010, 2013  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include "mpi_lu-double.h"
			
 
				-#include "plu_implicit_example.c"
			
--- a/mpi/examples/mpi_lu/plu_implicit_example_float.c
+++ b/mpi/examples/mpi_lu/plu_implicit_example_float.c
@@ -1,19 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010, 2013  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include "mpi_lu-float.h"
			
 
				-#include "plu_implicit_example.c"
			
--- a/mpi/examples/mpi_lu/plu_outofcore_example.c
+++ b/mpi/examples/mpi_lu/plu_outofcore_example.c
@@ -1,381 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include <stdlib.h>
			
 
				-#include <stdio.h>
			
 
				-#include <string.h>
			
 
				-#include <time.h>
			
 
				-#include <math.h>
			
 
				-#include <starpu.h>
			
 
				-#include <fcntl.h>
			
 
				-#include <sys/stat.h>
			
 
				-
			
 
				-#include "pxlu.h"
			
 
				-//#include "pxlu_kernels.h"
			
 
				-
			
 
				-#ifdef STARPU_HAVE_LIBNUMA
			
 
				-#include <numaif.h>
			
 
				-#endif
			
 
				-
			
 
				-static unsigned long size = 4096;
			
 
				-static unsigned nblocks = 16;
			
 
				-static unsigned check = 0;
			
 
				-static int p = 1;
			
 
				-static int q = 1;
			
 
				-static unsigned display = 0;
			
 
				-static char *path = "/tmp/starpu-mpi_LU";
			
 
				-
			
 
				-#ifdef STARPU_HAVE_LIBNUMA
			
 
				-static unsigned numa = 0;
			
 
				-#endif
			
 
				-
			
 
				-static size_t allocated_memory = 0;
			
 
				-
			
 
				-static starpu_data_handle_t *dataA_handles;
			
 
				-
			
 
				-int get_block_rank(unsigned i, unsigned j);
			
 
				-
			
 
				-static void parse_args(int argc, char **argv)
			
 
				-{
			
 
				-	int i;
			
 
				-	for (i = 1; i < argc; i++) {
			
 
				-		if (strcmp(argv[i], "-size") == 0) {
			
 
				-			char *argptr;
			
 
				-			size = strtol(argv[++i], &argptr, 10);
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-nblocks") == 0) {
			
 
				-			char *argptr;
			
 
				-			nblocks = strtol(argv[++i], &argptr, 10);
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-check") == 0) {
			
 
				-			check = 1;
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-display") == 0) {
			
 
				-			display = 1;
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-numa") == 0) {
			
 
				-#ifdef STARPU_HAVE_LIBNUMA
			
 
				-			numa = 1;
			
 
				-#else
			
 
				-			if (rank == 0)
			
 
				-				fprintf(stderr, "Warning: libnuma is not available\n");
			
 
				-#endif
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-p") == 0) {
			
 
				-			char *argptr;
			
 
				-			p = strtol(argv[++i], &argptr, 10);
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-q") == 0) {
			
 
				-			char *argptr;
			
 
				-			q = strtol(argv[++i], &argptr, 10);
			
 
				-		}
			
 
				-
			
 
				-		if (strcmp(argv[i], "-path") == 0)
			
 
				-			path = argv[++i];
			
 
				-
			
 
				-		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0) {
			
 
				-			fprintf(stderr,"usage: %s [-size n] [-nblocks b] [-check] [-display] [-numa] [-p p] [-q q] [-path PATH]\n", argv[0]);
			
 
				-			fprintf(stderr,"\np * q must be equal to the number of MPI nodes\n");
			
 
				-			exit(0);
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-unsigned STARPU_PLU(display_flag)(void)
			
 
				-{
			
 
				-	return display;
			
 
				-}
			
 
				-
			
 
				-static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnblocks)
			
 
				-{
			
 
				-	const unsigned block_size = (psize/pnblocks);
			
 
				-
			
 
				-	unsigned i, j;
			
 
				-	for (i = 0; i < block_size; i++)
			
 
				-	     for (j = 0; j < block_size; j++)
			
 
				-	     {
			
 
				-		  blockptr[j+i*block_size] = (TYPE)starpu_drand48();
			
 
				-	     }
			
 
				-}
			
 
				-
			
 
				-static void create_matrix()
			
 
				-{
			
 
				-	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
			
 
				-	TYPE *blockptr = malloc(blocksize);
			
 
				-	int fd;
			
 
				-	char *filename;
			
 
				-	unsigned filename_length = strlen(path) + 1 + sizeof(nblocks)*3 + 1 + sizeof(nblocks)*3 + 1;
			
 
				-
			
 
				-	filename = malloc(filename_length);
			
 
				-
			
 
				-	allocated_memory += nblocks*nblocks*blocksize*sizeof(TYPE *);
			
 
				-
			
 
				-	/* Create the whole matrix on the disk */
			
 
				-	unsigned i,j;
			
 
				-	for (j = 0; j < nblocks; j++)
			
 
				-	{
			
 
				-		for (i = 0; i < nblocks; i++)
			
 
				-		{
			
 
				-			fill_block_with_random(blockptr, size, nblocks);
			
 
				-			if (i == j)
			
 
				-			{
			
 
				-				unsigned tmp;
			
 
				-				for (tmp = 0; tmp < size/nblocks; tmp++)
			
 
				-				{
			
 
				-					blockptr[tmp*((size/nblocks)+1)] += (TYPE)10*nblocks;
			
 
				-				}
			
 
				-			}
			
 
				-			snprintf(filename, filename_length, "%s/%u,%u", path, i, j);
			
 
				-			fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0777);
			
 
				-			if (fd < 0) {
			
 
				-				perror("open");
			
 
				-				exit(1);
			
 
				-			}
			
 
				-			if (write(fd, blockptr, blocksize) != (ssize_t) blocksize) {
			
 
				-				fprintf(stderr,"short write");
			
 
				-				exit(1);
			
 
				-			}
			
 
				-			if (close(fd) < 0) {
			
 
				-				perror("close");
			
 
				-				exit(1);
			
 
				-			}
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	free(blockptr);
			
 
				-	free(filename);
			
 
				-}
			
 
				-
			
 
				-static void init_matrix(int rank)
			
 
				-{
			
 
				-	/* Allocate a grid of data handles, not all of them have to be allocated later on */
			
 
				-	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
			
 
				-
			
 
				-	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
			
 
				-
			
 
				-	int disk_node = starpu_disk_register(&starpu_disk_unistd_ops, path, STARPU_MAX(1024*1024, size*size*sizeof(TYPE)));
			
 
				-
			
 
				-	char filename[sizeof(nblocks)*3 + 1 + sizeof(nblocks)*3 + 1];
			
 
				-
			
 
				-	/* Allocate all the blocks that belong to this mpi node */
			
 
				-	unsigned i,j;
			
 
				-	for (j = 0; j < nblocks; j++)
			
 
				-	{
			
 
				-		for (i = 0; i < nblocks; i++)
			
 
				-		{
			
 
				-			int block_rank = get_block_rank(i, j);
			
 
				-//			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
			
 
				-			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
			
 
				-
			
 
				-			if (block_rank == rank)
			
 
				-			{
			
 
				-				void *disk_obj;
			
 
				-				snprintf(filename, sizeof(filename), "%u,%u", i, j);
			
 
				-				/* Register it to StarPU */
			
 
				-				disk_obj = starpu_disk_open(disk_node, filename, blocksize);
			
 
				-				if (!disk_obj) {
			
 
				-					fprintf(stderr,"could not open %s\n", filename);
			
 
				-					exit(1);
			
 
				-				}
			
 
				-				starpu_matrix_data_register(handleptr, disk_node,
			
 
				-					(uintptr_t) disk_obj, size/nblocks,
			
 
				-					size/nblocks, size/nblocks, sizeof(TYPE));
			
 
				-			}
			
 
				-			else {
			
 
				-				starpu_matrix_data_register(handleptr, -1,
			
 
				-					0, size/nblocks,
			
 
				-					size/nblocks, size/nblocks, sizeof(TYPE));
			
 
				-			}
			
 
				-			starpu_data_set_rank(*handleptr, block_rank);
			
 
				-			starpu_data_set_tag(*handleptr, j+i*nblocks);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	//display_all_blocks(nblocks, size/nblocks);
			
 
				-}
			
 
				-
			
 
				-TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j)
			
 
				-{
			
 
				-	/* This does not really make sense in out of core */
			
 
				-	assert(0);
			
 
				-}
			
 
				-
			
 
				-int get_block_rank(unsigned i, unsigned j)
			
 
				-{
			
 
				-	/* Take a 2D block cyclic distribution */
			
 
				-	/* NB: p (resp. q) is for "direction" i (resp. j) */
			
 
				-	return (j % q) * p + (i % p);
			
 
				-}
			
 
				-
			
 
				-starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
			
 
				-{
			
 
				-	return dataA_handles[j+i*nblocks];
			
 
				-}
			
 
				-
			
 
				-int main(int argc, char **argv)
			
 
				-{
			
 
				-	int rank;
			
 
				-	int world_size;
			
 
				-	int ret;
			
 
				-	unsigned i, j;
			
 
				-
			
 
				-	starpu_srand48((long int)time(NULL));
			
 
				-
			
 
				-	parse_args(argc, argv);
			
 
				-
			
 
				-	ret = mkdir(path, 0777);
			
 
				-	if (ret != 0 && errno != EEXIST) {
			
 
				-		fprintf(stderr,"%s does not exist\n", path);
			
 
				-		exit(1);
			
 
				-	}
			
 
				-
			
 
				-	ret = starpu_init(NULL);
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-
			
 
				-	ret = starpu_mpi_init(&argc, &argv, 1);
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				-
			
 
				-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
			
 
				-
			
 
				-	STARPU_ASSERT(p*q == world_size);
			
 
				-
			
 
				-	starpu_cublas_init();
			
 
				-
			
 
				-	/*
			
 
				-	 * 	Problem Init
			
 
				-	 */
			
 
				-
			
 
				-	if (rank == 0)
			
 
				-		create_matrix();
			
 
				-
			
 
				-	starpu_mpi_barrier(MPI_COMM_WORLD);
			
 
				-
			
 
				-	init_matrix(rank);
			
 
				-
			
 
				-	if (rank == 0)
			
 
				-		fprintf(stderr, "%dMB on disk\n", (int)(allocated_memory/(1024*1024)));
			
 
				-
			
 
				-	TYPE *a_r = NULL;
			
 
				-//	STARPU_PLU(display_data_content)(a_r, size);
			
 
				-
			
 
				-	TYPE *x, *y;
			
 
				-
			
 
				-	if (check)
			
 
				-	{
			
 
				-		x = calloc(size, sizeof(TYPE));
			
 
				-		STARPU_ASSERT(x);
			
 
				-
			
 
				-		y = calloc(size, sizeof(TYPE));
			
 
				-		STARPU_ASSERT(y);
			
 
				-
			
 
				-		if (rank == 0)
			
 
				-		{
			
 
				-			unsigned ind;
			
 
				-			for (ind = 0; ind < size; ind++)
			
 
				-				x[ind] = (TYPE)starpu_drand48();
			
 
				-		}
			
 
				-
			
 
				-		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
			
 
				-
			
 
				-		if (rank == 0)
			
 
				-			STARPU_PLU(display_data_content)(a_r, size);
			
 
				-
			
 
				-//		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
			
 
				-	}
			
 
				-
			
 
				-	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);
			
 
				-
			
 
				-	/*
			
 
				-	 * 	Report performance
			
 
				-	 */
			
 
				-
			
 
				-	if (rank == 0)
			
 
				-	{
			
 
				-		fprintf(stderr, "Computation took: %f ms\n", timing/1000);
			
 
				-
			
 
				-		unsigned n = size;
			
 
				-		double flop = (2.0f*n*n*n)/3.0f;
			
 
				-		fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-	 *	Test Result Correctness
			
 
				-	 */
			
 
				-
			
 
				-	if (check)
			
 
				-	{
			
 
				-		/*
			
 
				-		 *	Compute || A - LU ||
			
 
				-		 */
			
 
				-
			
 
				-		STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r);
			
 
				-
			
 
				-#if 0
			
 
				-		/*
			
 
				-		 *	Compute || Ax - LUx ||
			
 
				-		 */
			
 
				-
			
 
				-		unsigned ind;
			
 
				-
			
 
				-		y2 = calloc(size, sizeof(TYPE));
			
 
				-		STARPU_ASSERT(y);
			
 
				-
			
 
				-		if (rank == 0)
			
 
				-		{
			
 
				-			for (ind = 0; ind < size; ind++)
			
 
				-			{
			
 
				-				y2[ind] = (TYPE)0.0;
			
 
				-			}
			
 
				-		}
			
 
				-
			
 
				-		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);
			
 
				-
			
 
				-		/* Compute y2 = y2 - y */
			
 
				-		CPU_AXPY(size, -1.0, y, 1, y2, 1);
			
 
				-
			
 
				-		TYPE err = CPU_ASUM(size, y2, 1);
			
 
				-		int max = CPU_IAMAX(size, y2, 1);
			
 
				-
			
 
				-		fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
			
 
				-		fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
			
 
				-#endif
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-	 * 	Termination
			
 
				-	 */
			
 
				-	for (j = 0; j < nblocks; j++)
			
 
				-	{
			
 
				-		for (i = 0; i < nblocks; i++)
			
 
				-		{
			
 
				-			starpu_data_unregister(dataA_handles[j+nblocks*i]);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	starpu_cublas_shutdown();
			
 
				-	starpu_mpi_shutdown();
			
 
				-	starpu_shutdown();
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
--- a/mpi/examples/mpi_lu/plu_outofcore_example_double.c
+++ b/mpi/examples/mpi_lu/plu_outofcore_example_double.c
@@ -1,19 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010, 2013  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include "mpi_lu-double.h"
			
 
				-#include "plu_outofcore_example.c"
			
--- a/mpi/examples/mpi_lu/plu_outofcore_example_float.c
+++ b/mpi/examples/mpi_lu/plu_outofcore_example_float.c
@@ -1,19 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010, 2013  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include "mpi_lu-float.h"
			
 
				-#include "plu_outofcore_example.c"
			
--- a/mpi/examples/mpi_lu/pslu_implicit.c
+++ b/mpi/examples/mpi_lu/pslu_implicit.c
@@ -1,19 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010, 2013  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include "mpi_lu-float.h"
			
 
				-#include "pxlu_implicit.c"
			
--- a/mpi/examples/mpi_lu/pxlu_implicit.c
+++ b/mpi/examples/mpi_lu/pxlu_implicit.c
@@ -1,162 +0,0 @@
 
				-/* StarPU --- Runtime system for heterogeneous multicore architectures.
			
 
				- *
			
 
				- * Copyright (C) 2010, 2011, 2013  Université de Bordeaux 1
			
 
				- * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
			
 
				- *
			
 
				- * StarPU is free software; you can redistribute it and/or modify
			
 
				- * it under the terms of the GNU Lesser General Public License as published by
			
 
				- * the Free Software Foundation; either version 2.1 of the License, or (at
			
 
				- * your option) any later version.
			
 
				- *
			
 
				- * StarPU is distributed in the hope that it will be useful, but
			
 
				- * WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				- *
			
 
				- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				- */
			
 
				-
			
 
				-#include "pxlu.h"
			
 
				-#include "pxlu_kernels.h"
			
 
				-#include <sys/time.h>
			
 
				-
			
 
				-//#define VERBOSE_INIT	1
			
 
				-
			
 
				-//#define DEBUG	1
			
 
				-
			
 
				-static unsigned no_prio = 0;
			
 
				-
			
 
				-static unsigned nblocks = 0;
			
 
				-static int rank = -1;
			
 
				-static int world_size = -1;
			
 
				-
			
 
				-struct callback_arg {
			
 
				-	unsigned i, j, k;
			
 
				-};
			
 
				-
			
 
				-/*
			
 
				- *	Task 11 (diagonal factorization)
			
 
				- */
			
 
				-
			
 
				-static void create_task_11(unsigned k)
			
 
				-{
			
 
				-	starpu_mpi_insert_task(MPI_COMM_WORLD,
			
 
				-			&STARPU_PLU(cl11),
			
 
				-			STARPU_VALUE, &k, sizeof(k),
			
 
				-			STARPU_VALUE, &k, sizeof(k),
			
 
				-			STARPU_VALUE, &k, sizeof(k),
			
 
				-			STARPU_RW, STARPU_PLU(get_block_handle)(k, k),
			
 
				-			STARPU_PRIORITY, !no_prio ?
			
 
				-				STARPU_MAX_PRIO : STARPU_MIN_PRIO,
			
 
				-			0);
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- *	Task 12 (Update lower left (TRSM))
			
 
				- */
			
 
				-
			
 
				-static void create_task_12(unsigned k, unsigned j)
			
 
				-{
			
 
				-#warning temporary fix 
			
 
				-	starpu_mpi_insert_task(MPI_COMM_WORLD,
			
 
				-			//&STARPU_PLU(cl12),
			
 
				-			&STARPU_PLU(cl21),
			
 
				-			STARPU_VALUE, &j, sizeof(j),
			
 
				-			STARPU_VALUE, &j, sizeof(j),
			
 
				-			STARPU_VALUE, &k, sizeof(k),
			
 
				-			STARPU_R, STARPU_PLU(get_block_handle)(k, k),
			
 
				-			STARPU_RW, STARPU_PLU(get_block_handle)(k, j),
			
 
				-			STARPU_PRIORITY, !no_prio && (j == k+1) ?
			
 
				-				STARPU_MAX_PRIO : STARPU_MIN_PRIO,
			
 
				-			0);
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- *	Task 21 (Update upper right (TRSM))
			
 
				- */
			
 
				-
			
 
				-static void create_task_21(unsigned k, unsigned i)
			
 
				-{
			
 
				-#warning temporary fix 
			
 
				-	starpu_mpi_insert_task(MPI_COMM_WORLD,
			
 
				-			//&STARPU_PLU(cl21),
			
 
				-			&STARPU_PLU(cl12),
			
 
				-			STARPU_VALUE, &i, sizeof(i),
			
 
				-			STARPU_VALUE, &i, sizeof(i),
			
 
				-			STARPU_VALUE, &k, sizeof(k),
			
 
				-			STARPU_R, STARPU_PLU(get_block_handle)(k, k),
			
 
				-			STARPU_RW, STARPU_PLU(get_block_handle)(i, k),
			
 
				-			STARPU_PRIORITY, !no_prio && (i == k+1) ?
			
 
				-				STARPU_MAX_PRIO : STARPU_MIN_PRIO,
			
 
				-			0);
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- *	Task 22 (GEMM)
			
 
				- */
			
 
				-
			
 
				-static void create_task_22(unsigned k, unsigned i, unsigned j)
			
 
				-{
			
 
				-	starpu_mpi_insert_task(MPI_COMM_WORLD,
			
 
				-			&STARPU_PLU(cl22),
			
 
				-			STARPU_VALUE, &i, sizeof(i),
			
 
				-			STARPU_VALUE, &j, sizeof(j),
			
 
				-			STARPU_VALUE, &k, sizeof(k),
			
 
				-			STARPU_R, STARPU_PLU(get_block_handle)(k, j),
			
 
				-			STARPU_R, STARPU_PLU(get_block_handle)(i, k),
			
 
				-			STARPU_RW, STARPU_PLU(get_block_handle)(i, j),
			
 
				-			STARPU_PRIORITY, !no_prio && (i == k + 1) && (j == k +1) ?
			
 
				-				STARPU_MAX_PRIO : STARPU_MIN_PRIO,
			
 
				-			0);
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- *	code to bootstrap the factorization 
			
 
				- */
			
 
				-
			
 
				-double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
			
 
				-{
			
 
				-	struct timeval start;
			
 
				-	struct timeval end;
			
 
				-
			
 
				-	nblocks = _nblocks;
			
 
				-	rank = _rank;
			
 
				-	world_size = _world_size;
			
 
				-
			
 
				-	/* create all the DAG nodes */
			
 
				-	unsigned i,j,k;
			
 
				-
			
 
				-	starpu_mpi_barrier(MPI_COMM_WORLD);
			
 
				-
			
 
				-	gettimeofday(&start, NULL);
			
 
				-
			
 
				-	for (k = 0; k < nblocks; k++)
			
 
				-	{
			
 
				-		create_task_11(k);
			
 
				-
			
 
				-		for (i = k+1; i<nblocks; i++)
			
 
				-		{
			
 
				-			create_task_12(k, i);
			
 
				-			create_task_21(k, i);
			
 
				-		}
			
 
				-
			
 
				-		for (i = k+1; i<nblocks; i++)
			
 
				-		{
			
 
				-			for (j = k+1; j<nblocks; j++)
			
 
				-			{
			
 
				-				create_task_22(k, i, j);
			
 
				-			}
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	starpu_task_wait_for_all();
			
 
				-
			
 
				-	starpu_mpi_barrier(MPI_COMM_WORLD);
			
 
				-
			
 
				-	gettimeofday(&end, NULL);
			
 
				-
			
 
				-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
			
 
				-	
			
 
				-//	fprintf(stderr, "RANK %d -> took %f ms\n", rank, timing/1000);
			
 
				-	
			
 
				-	return timing;
			
 
				-}
			
--- a/mpi/src/starpu_mpi_collective.c
+++ b/mpi/src/starpu_mpi_collective.c
@@ -57,6 +57,7 @@ int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, i
 
				 		callback_arg->nb = 0;
			
 
				 		callback_arg->callback = (rank == root) ? scallback : rcallback;
			
 
				 		callback_arg->arg = (rank == root) ? sarg : rarg;
			
 
				+		if (callback_arg->callback == NULL)
			
 
				 
			
 
				 		for(x = 0; x < count ; x++)
			
 
				 		{
			
--- a/mpi/tests/Makefile.am
+++ b/mpi/tests/Makefile.am
@@ -198,8 +198,5 @@ mpi_reduction_SOURCES += mpi_reduction_kernels.c
 
				 user_defined_datatype_SOURCES = user_defined_datatype.c
			
 
				 user_defined_datatype_SOURCES += $(top_srcdir)/examples/interface/complex_interface.c
			
 
				 
			
 
				-mpi_earlyrecv2_SOURCES = mpi_earlyrecv2.c
			
 
				-mpi_earlyrecv2_SOURCES += $(top_srcdir)/examples/interface/complex_interface.c
			
 
				-
			
 
				 showcheck:
			
 
				 	-cat $(TEST_LOGS) /dev/null
			
--- a/mpi/tests/mpi_earlyrecv2.c
+++ b/mpi/tests/mpi_earlyrecv2.c
@@ -18,199 +18,81 @@
 
				 #include <starpu_mpi.h>
			
 
				 #include "helper.h"
			
 
				 #include <unistd.h>
			
 
				-#include <interface/complex_interface.h>
			
 
				 
			
 
				+//#define NB 1000
			
 
				 #define NB 10
			
 
				 
			
 
				-static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
			
 
				-static starpu_pthread_cond_t cond = STARPU_PTHREAD_COND_INITIALIZER;
			
 
				-
			
 
				-void callback(void *arg)
			
 
				+int main(int argc, char **argv)
			
 
				 {
			
 
				-	unsigned *received = arg;
			
 
				+	int ret, rank, size, i;
			
 
				+	starpu_data_handle_t tab_handle[NB];
			
 
				+	int value[NB];
			
 
				 
			
 
				-	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
			
 
				-	*received = *received + 1;
			
 
				-	FPRINTF_MPI("Requests %d received\n", *received);
			
 
				-	STARPU_PTHREAD_COND_SIGNAL(&cond);
			
 
				-	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
			
 
				-}
			
 
				+	MPI_Init(NULL, NULL);
			
 
				+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				+	MPI_Comm_size(MPI_COMM_WORLD, &size);
			
 
				 
			
 
				-typedef void (*check_func)(starpu_data_handle_t handle, int i, int rank, int *error);
			
 
				+	if (size%2 != 0)
			
 
				+	{
			
 
				+		if (rank == 0)
			
 
				+			FPRINTF(stderr, "We need a even number of processes.\n");
			
 
				+
			
 
				+		MPI_Finalize();
			
 
				+		return STARPU_TEST_SKIPPED;
			
 
				+	}
			
 
				+
			
 
				+	ret = starpu_init(NULL);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				+	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				+
			
 
				+	for(i=0 ; i<NB ; i++)
			
 
				+	{
			
 
				+		value[i]=i*rank;
			
 
				+		starpu_variable_data_register(&tab_handle[i], STARPU_MAIN_RAM, (uintptr_t)&value[i], sizeof(int));
			
 
				+		starpu_data_set_tag(tab_handle[i], i);
			
 
				+	}
			
 
				 
			
 
				-int exchange(int rank, starpu_data_handle_t *handles, check_func func, int detached)
			
 
				-{
			
 
				 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
			
 
				-	int i;
			
 
				 
			
 
				 	if (rank%2)
			
 
				 	{
			
 
				-		starpu_mpi_send(handles[0], other_rank, 0, MPI_COMM_WORLD);
			
 
				-		starpu_mpi_send(handles[NB-1], other_rank, NB-1, MPI_COMM_WORLD);
			
 
				+		starpu_mpi_send(tab_handle[0], other_rank, 0, MPI_COMM_WORLD);
			
 
				+		starpu_mpi_send(tab_handle[NB-1], other_rank, NB-1, MPI_COMM_WORLD);
			
 
				 		for(i=1 ; i<NB-1 ; i++)
			
 
				 		{
			
 
				-			starpu_mpi_send(handles[i], other_rank, i, MPI_COMM_WORLD);
			
 
				+			starpu_mpi_send(tab_handle[i], other_rank, i, MPI_COMM_WORLD);
			
 
				 		}
			
 
				-		return 0;
			
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				-		int ret=0;
			
 
				 		starpu_mpi_req req[NB];
			
 
				-		int received = 0;
			
 
				-
			
 
				-		if (detached)
			
 
				-		{
			
 
				-			starpu_mpi_irecv_detached(handles[0], other_rank, 0, MPI_COMM_WORLD, callback, &received);
			
 
				-		}
			
 
				-		else
			
 
				-		{
			
 
				-			memset(req, 0, NB*sizeof(starpu_mpi_req));
			
 
				-			starpu_mpi_irecv(handles[0], &req[0], other_rank, 0, MPI_COMM_WORLD);
			
 
				-			STARPU_ASSERT(req[0] != NULL);
			
 
				-		}
			
 
				+		memset(req, 0, NB*sizeof(starpu_mpi_req));
			
 
				 
			
 
				+		starpu_mpi_irecv(tab_handle[0], &req[0], other_rank, 0, MPI_COMM_WORLD);
			
 
				+		STARPU_ASSERT(req[0] != NULL);
			
 
				 		// We sleep to make sure that the data for the tag 9 will be received before the recv is posted
			
 
				 		usleep(2000000);
			
 
				 		for(i=1 ; i<NB ; i++)
			
 
				 		{
			
 
				-			if (detached)
			
 
				-			{
			
 
				-				starpu_mpi_irecv_detached(handles[i], other_rank, i, MPI_COMM_WORLD, callback, &received);
			
 
				-			}
			
 
				-			else
			
 
				-			{
			
 
				-				starpu_mpi_irecv(handles[i], &req[i], other_rank, i, MPI_COMM_WORLD);
			
 
				-				STARPU_ASSERT(req[i] != NULL);
			
 
				-			}
			
 
				-		}
			
 
				-
			
 
				-		if (detached)
			
 
				-		{
			
 
				-			STARPU_PTHREAD_MUTEX_LOCK(&mutex);
			
 
				-			while (received != NB)
			
 
				-			{
			
 
				-			     FPRINTF_MPI("Received %d messages\n", received);
			
 
				-			     STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
			
 
				-			}
			
 
				-			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
			
 
				+			starpu_mpi_irecv(tab_handle[i], &req[i], other_rank, i, MPI_COMM_WORLD);
			
 
				+			STARPU_ASSERT(req[i] != NULL);
			
 
				 		}
			
 
				-		else
			
 
				+		for(i=0 ; i<NB ; i++)
			
 
				 		{
			
 
				-			for(i=0 ; i<NB ; i++)
			
 
				-			{
			
 
				-			     starpu_mpi_wait(&req[i], NULL);
			
 
				-			     func(handles[i], i, rank, &ret);
			
 
				-			}
			
 
				+			starpu_mpi_wait(&req[i], NULL);
			
 
				+			int *rvalue = (int *)starpu_data_get_local_ptr(tab_handle[i]);
			
 
				+			STARPU_ASSERT_MSG(*rvalue==i*other_rank, "Incorrect received value: %d != %d\n", *rvalue, i*other_rank);
			
 
				 		}
			
 
				-		return ret;
			
 
				 	}
			
 
				-}
			
 
				-
			
 
				-void check_variable(starpu_data_handle_t handle, int i, int rank, int *error)
			
 
				-{
			
 
				-	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
			
 
				-
			
 
				-	int *rvalue = (int *)starpu_data_get_local_ptr(handle);
			
 
				-	if (*rvalue != i*other_rank)
			
 
				-	{
			
 
				-		FPRINTF_MPI("Incorrect received value: %d != %d\n", *rvalue, i*other_rank);
			
 
				-		*error = 1;
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-int exchange_variable(int rank, int detached)
			
 
				-{
			
 
				-	int ret, i;
			
 
				-	starpu_data_handle_t tab_handle[NB];
			
 
				-	int value[NB];
			
 
				 
			
 
				-	FPRINTF_MPI("Exchanging variable data with detached=%d\n", detached);
			
 
				-
			
 
				-	for(i=0 ; i<NB ; i++)
			
 
				-	{
			
 
				-		value[i]=i*rank;
			
 
				-		starpu_variable_data_register(&tab_handle[i], STARPU_MAIN_RAM, (uintptr_t)&value[i], sizeof(int));
			
 
				-		starpu_data_set_tag(tab_handle[i], i);
			
 
				-	}
			
 
				-	ret = exchange(rank, tab_handle, check_variable, detached);
			
 
				 	for(i=0 ; i<NB ; i++)
			
 
				 		starpu_data_unregister(tab_handle[i]);
			
 
				 
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				-void check_complex(starpu_data_handle_t handle, int i, int rank, int *error)
			
 
				-{
			
 
				-	double *real = starpu_complex_get_real(handle);
			
 
				-	double *imaginary = starpu_complex_get_imaginary(handle);
			
 
				-
			
 
				-	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
			
 
				-
			
 
				-	if ((*real != ((i*other_rank)+12)) || (*imaginary != ((i*other_rank)+45)))
			
 
				-	{
			
 
				-		FPRINTF_MPI("Incorrect received value: %f != %d || %f != %d\n", *real, ((i*other_rank)+12), *imaginary, ((i*other_rank)+45));
			
 
				-		*error = 1;
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-int exchange_complex(int rank, int detached)
			
 
				-{
			
 
				-	int ret, i;
			
 
				-	starpu_data_handle_t handle[NB];
			
 
				-	double real[NB];
			
 
				-	double imaginary[NB];
			
 
				-
			
 
				-	FPRINTF_MPI("Exchanging complex data with detached=%d\n", detached);
			
 
				-
			
 
				-	for(i=0 ; i<NB ; i++)
			
 
				-	{
			
 
				-		real[i] = (i*rank)+12;
			
 
				-		imaginary[i] = (i*rank)+45;
			
 
				-		starpu_complex_data_register(&handle[i], STARPU_MAIN_RAM, &real[i], &imaginary[i], 1);
			
 
				-		starpu_data_set_tag(handle[i], i);
			
 
				-	}
			
 
				-	ret = exchange(rank, handle, check_complex, detached);
			
 
				-	for(i=0 ; i<NB ; i++)
			
 
				-		starpu_data_unregister(handle[i]);
			
 
				-
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				-int main(int argc, char **argv)
			
 
				-{
			
 
				-	int ret, rank, size;
			
 
				-
			
 
				-	MPI_Init(NULL, NULL);
			
 
				-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
			
 
				-	MPI_Comm_size(MPI_COMM_WORLD, &size);
			
 
				-
			
 
				-	if (size%2 != 0)
			
 
				-	{
			
 
				-		if (rank == 0)
			
 
				-			FPRINTF(stderr, "We need a even number of processes.\n");
			
 
				-
			
 
				-		MPI_Finalize();
			
 
				-		return STARPU_TEST_SKIPPED;
			
 
				-	}
			
 
				-
			
 
				-	ret = starpu_init(NULL);
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				-	ret = starpu_mpi_init(NULL, NULL, 0);
			
 
				-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
			
 
				-
			
 
				-	ret = exchange_variable(rank, 0);
			
 
				-	if (ret == 0)
			
 
				-		ret = exchange_variable(rank, 1);
			
 
				-	if (ret == 0)
			
 
				-		ret = exchange_complex(rank, 0);
			
 
				-	if (ret == 0)
			
 
				-		ret = exchange_complex(rank, 1);
			
 
				-
			
 
				 	starpu_mpi_shutdown();
			
 
				 	starpu_shutdown();
			
 
				 
			
 
				 	MPI_Finalize();
			
 
				 
			
 
				-	return ret;
			
 
				+	return 0;
			
 
				 }
			
--- a/src/common/barrier_counter.h
+++ b/src/common/barrier_counter.h
@@ -14,6 +14,9 @@
 
				  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
			
 
				  */
			
 
				 
			
 
				+#ifndef __BARRIER_COUNTER_H__
			
 
				+#define __BARRIER_COUNTER_H__
			
 
				+
			
 
				 #include <common/utils.h>
			
 
				 #include <common/barrier.h>
			
 
				 
			
@@ -37,3 +40,4 @@ int _starpu_barrier_counter_increment_until_full_counter(struct _starpu_barrier_
 
				 
			
 
				 int _starpu_barrier_counter_increment(struct _starpu_barrier_counter *barrier_c);
			
 
				 
			
 
				+#endif
			
--- a/src/core/combined_workers.c
+++ b/src/core/combined_workers.c
@@ -67,9 +67,14 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 
				 	{
			
 
				 		int id = workerid_array[i];
			
 
				 
			
 
				+#ifdef STARPU_USE_MIC
			
 
				+		STARPU_ASSERT(config->workers[id].arch == STARPU_CPU_WORKER || config->workers[id].arch == STARPU_MIC_WORKER);
			
 
				+		STARPU_ASSERT(config->workers[id].worker_mask == STARPU_CPU || config->workers[id].worker_mask == STARPU_MIC);
			
 
				+#else/* STARPU_USE_MIC */
			
 
				 		/* We only combine CPUs */
			
 
				-		STARPU_ASSERT(config->workers[id].perf_arch == STARPU_CPU_DEFAULT);
			
 
				+		STARPU_ASSERT(config->workers[id].arch == STARPU_CPU_WORKER);
			
 
				 		STARPU_ASSERT(config->workers[id].worker_mask == STARPU_CPU);
			
 
				+#endif /* STARPU_USE_MIC */
			
 
				 
			
 
				 		/* We only combine valid "basic" workers */
			
 
				 		if ((id < 0) || (id >= basic_worker_count))
			
@@ -95,8 +100,21 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 
				 		&config->combined_workers[combined_worker_id];
			
 
				 
			
 
				 	combined_worker->worker_size = nworkers;
			
 
				-	combined_worker->perf_arch = (enum starpu_perfmodel_archtype) (STARPU_CPU_DEFAULT + nworkers - 1);
			
 
				-	combined_worker->worker_mask = STARPU_CPU;
			
 
				+
			
 
				+#ifdef STARPU_USE_MIC
			
 
				+	if(config->workers[workerid_array[0]].worker_mask == STARPU_MIC)
			
 
				+	{
			
 
				+		combined_worker->perf_arch = (enum starpu_perfmodel_archtype) (STARPU_MIC_DEFAULT + config->workers[workerid_array[0]].mp_nodeid /* *STARPU_MAXMICCPUS + nworkers - 1*/);
			
 
				+		combined_worker->worker_mask = STARPU_MIC;
			
 
				+	}
			
 
				+#endif
			
 
				+	if(config->workers[workerid_array[0]].worker_mask == STARPU_CPU)
			
 
				+	{
			
 
				+		combined_worker->perf_arch = (enum starpu_perfmodel_archtype) (STARPU_CPU_DEFAULT + nworkers - 1);
			
 
				+		combined_worker->worker_mask = STARPU_CPU;
			
 
				+	}
			
 
				+	combined_worker->count = nworkers -1;
			
 
				+	pthread_mutex_init(&combined_worker->count_mutex,NULL);
			
 
				 
			
 
				 	/* We assume that the memory node should either be that of the first
			
 
				 	 * entry, and it is very likely that every worker in the combination
			
--- a/src/core/detect_combined_workers.c
+++ b/src/core/detect_combined_workers.c
@@ -191,58 +191,119 @@ static void find_and_assign_combinations_with_hwloc(int *workerids, int nworkers
 
				 
			
 
				 #else /* STARPU_HAVE_HWLOC */
			
 
				 
			
 
				+static void assign_combinations_without_hwloc(struct starpu_worker_collection* worker_collection, int* workers, unsigned n, int min, int max)
			
 
				+{
			
 
				+
			
 
				+	int size,i,count =0;
			
 
				+	//if the maximun number of worker is already reached
			
 
				+	if(worker_collection->nworkers >= STARPU_NMAXWORKERS - 1)
			
 
				+		return;
			
 
				+
			
 
				+	for (size = min; size <= max; size *= 2)
			
 
				+	{
			
 
				+		unsigned first;
			
 
				+		for (first = 0; first < n; first += size)
			
 
				+		{
			
 
				+			if (first + size <= n)
			
 
				+			{
			
 
				+				int found_workerids[size];
			
 
				+
			
 
				+				for (i = 0; i < size; i++)
			
 
				+					found_workerids[i] = workers[first + i];
			
 
				+
			
 
				+				/* We register this combination */
			
 
				+				int newworkerid;
			
 
				+				newworkerid = starpu_combined_worker_assign_workerid(size, found_workerids);
			
 
				+				STARPU_ASSERT(newworkerid >= 0);
			
 
				+				count++;
			
 
				+				worker_collection->add(worker_collection, newworkerid);
			
 
				+				//if the maximun number of worker is reached, then return
			
 
				+				if(worker_collection->nworkers >= STARPU_NMAXWORKERS - 1)
			
 
				+					return;
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+
			
 
				 static void find_and_assign_combinations_without_hwloc(int *workerids, int nworkers)
			
 
				 {
			
 
				+	int i;
			
 
				+	unsigned j;
			
 
				 	unsigned sched_ctx_id  = starpu_sched_ctx_get_context();
			
 
				 	if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
			
 
				 		sched_ctx_id = 0;
			
 
				-	int min;
			
 
				-	int max;
			
 
				+	int min, max, mic_min, mic_max;
			
 
				 
			
 
				 	struct starpu_worker_collection* workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
			
 
				 
			
 
				 	/* We put the id of all CPU workers in this array */
			
 
				 	int cpu_workers[STARPU_NMAXWORKERS];
			
 
				 	unsigned ncpus = 0;
			
 
				+#ifdef STARPU_USE_MIC
			
 
				+	unsigned nb_mics = _starpu_get_machine_config()->topology.nmicdevices;
			
 
				+	unsigned * nmics_table;
			
 
				+	int * mic_id;
			
 
				+	int ** mic_workers;
			
 
				+	mic_id = malloc(sizeof(int)*nb_mics);
			
 
				+	nmics_table = malloc(sizeof(unsigned)*nb_mics);
			
 
				+	mic_workers = malloc(sizeof(int*)*nb_mics);
			
 
				+	for(i=0; i<nb_mics; i++)
			
 
				+	{
			
 
				+		mic_id[i] = -1;
			
 
				+		nmics_table[i] = 0;
			
 
				+		mic_workers[i] = malloc(sizeof(int)*STARPU_NMAXWORKERS);
			
 
				+	}
			
 
				+#endif /* STARPU_USE_MIC */
			
 
				 
			
 
				 	struct _starpu_worker *worker;
			
 
				-	int i;
			
 
				 	for (i = 0; i < nworkers; i++)
			
 
				 	{
			
 
				 		worker = _starpu_get_worker_struct(workerids[i]);
			
 
				-
			
 
				-		if (worker->perf_arch == STARPU_CPU_DEFAULT)
			
 
				+		if (worker->arch == STARPU_CPU_WORKER)
			
 
				 			cpu_workers[ncpus++] = i;
			
 
				+#ifdef STARPU_USE_MIC
			
 
				+		else if(worker->arch == STARPU_MIC_WORKER)
			
 
				+		{
			
 
				+			for(j=0; mic_id[j] != worker->mp_nodeid && mic_id[j] != -1 && j<nb_mics; j++);
			
 
				+			if(j<nb_mics)
			
 
				+			{
			
 
				+				if(mic_id[j] == -1)
			
 
				+				{
			
 
				+					mic_id[j] = worker->mp_nodeid;					
			
 
				+				}
			
 
				+				mic_workers[j][nmics_table[j]++] = i;
			
 
				+			}
			
 
				+		}
			
 
				+#endif /* STARPU_USE_MIC */
			
 
				+
			
 
				 	}
			
 
				 
			
 
				+
			
 
				 	min = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
			
 
				 	if (min < 2)
			
 
				 		min = 2;
			
 
				 	max = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
			
 
				 	if (max == -1 || max > (int) ncpus)
			
 
				 		max = ncpus;
			
 
				-
			
 
				-	int size;
			
 
				-	for (size = min; size <= max; size *= 2)
			
 
				+	
			
 
				+	assign_combinations_without_hwloc(workers,cpu_workers,ncpus,min,max);
			
 
				+#ifdef STARPU_USE_MIC
			
 
				+	mic_min = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
			
 
				+	if (mic_min < 2)
			
 
				+		mic_min = 2;
			
 
				+	for(i=0; i<nb_mics; i++)
			
 
				 	{
			
 
				-		unsigned first_cpu;
			
 
				-		for (first_cpu = 0; first_cpu < ncpus; first_cpu += size)
			
 
				-		{
			
 
				-			if (first_cpu + size <= ncpus)
			
 
				-			{
			
 
				-				int found_workerids[size];
			
 
				-
			
 
				-				for (i = 0; i < size; i++)
			
 
				-					found_workerids[i] = cpu_workers[first_cpu + i];
			
 
				-
			
 
				-				/* We register this combination */
			
 
				-				int newworkerid;
			
 
				-				newworkerid = starpu_combined_worker_assign_workerid(size, found_workerids);
			
 
				-				STARPU_ASSERT(newworkerid >= 0);
			
 
				-				workers->add(workers, newworkerid);
			
 
				-			}
			
 
				-		}
			
 
				+		mic_max = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
			
 
				+		if (mic_max == -1 || mic_max > (int) nmics_table[i])
			
 
				+			mic_max = nmics_table[i];
			
 
				+		assign_combinations_without_hwloc(workers,mic_workers[i],nmics_table[i],mic_min,mic_max);
			
 
				+		free(mic_workers[i]);
			
 
				 	}
			
 
				+	free(mic_id);
			
 
				+	free(nmics_table);
			
 
				+	free(mic_workers);
			
 
				+#endif /* STARPU_USE_MIC */
			
 
				 }
			
 
				 
			
 
				 #endif /* STARPU_HAVE_HWLOC */
			
--- a/src/core/jobs.c
+++ b/src/core/jobs.c
@@ -129,9 +129,12 @@ void _starpu_wait_job(struct _starpu_job *j)
 
				 	 * way, _starpu_wait_job won't return until the entire task was really
			
 
				 	 * executed (so that we cannot destroy the task while it is still being
			
 
				 	 * manipulated by the driver). */
			
 
				+
			
 
				 	while (j->terminated != 2)
			
 
				+	{
			
 
				 		STARPU_PTHREAD_COND_WAIT(&j->sync_cond, &j->sync_mutex);
			
 
				-
			
 
				+	}
			
 
				+	
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
			
 
				         _STARPU_LOG_OUT();
			
 
				 }
			
--- a/src/core/sched_policy.c
+++ b/src/core/sched_policy.c
@@ -595,7 +595,6 @@ pick:
 
				 			else
			
 
				 				sched_ctx = _get_next_sched_ctx_to_pop_into(worker);
			
 
				 
			
 
				-
			
 
				 			if(sched_ctx && sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
			
 
				 			{
			
 
				 				if (sched_ctx->sched_policy && sched_ctx->sched_policy->pop_task)
			
--- a/src/core/topology.c
+++ b/src/core/topology.c
@@ -682,8 +682,8 @@ _starpu_init_mp_config (struct _starpu_machine_config *config,
 
				 		if (0 == _starpu_init_mic_node (config, i, &handles[i], &process[i]))
			
 
				 			topology->nmicdevices++;
			
 
				 
			
 
				-	i = 0;
			
 
				-	for (; i < topology->nmicdevices; i++)
			
 
				+	
			
 
				+	for (i = 0; i < topology->nmicdevices; i++)
			
 
				 		_starpu_init_mic_config (config, user_conf, i);
			
 
				 #endif
			
 
				 }
			
@@ -1094,6 +1094,9 @@ _starpu_bind_thread_on_cpus (
 
				 		}
			
 
				 	}
			
 
				 #else
			
 
				+#ifdef __GLIBC__
			
 
				+	pthread_setaffinity_np(pthread_self(),sizeof(cpu_set_t),&combined_worker->cpu_set);
			
 
				+#endif
			
 
				 #warning no parallel worker CPU binding support
			
 
				 #endif
			
 
				 }
			
@@ -1260,6 +1263,7 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
				 
			
 
				 		if (is_a_set_of_accelerators)
			
 
				 		{
			
 
				+/* TODO: il faudrait changer quand on change de device */
			
 
				 			if (accelerator_bindid == -1)
			
 
				 				accelerator_bindid = _starpu_get_next_bindid(config, preferred_binding, npreferred);
			
 
				 
			
--- a/src/core/workers.c
+++ b/src/core/workers.c
@@ -293,10 +293,15 @@ int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_tas
 
				 	}
			
 
				 	else
			
 
				 	{
			
 
				-		if ((cl->type == STARPU_SPMD)
			
 
				+		if ((cl->type == STARPU_SPMD) 
			
 
				 #ifdef STARPU_HAVE_HWLOC
			
 
				 				|| (cl->type == STARPU_FORKJOIN)
			
 
				+#else
			
 
				+#ifdef __GLIBC__
			
 
				+				|| (cl->type == STARPU_FORKJOIN)
			
 
				+#endif
			
 
				 #endif
			
 
				+
			
 
				 				)
			
 
				 		{
			
 
				 			/* TODO we should add other types of constraints */
			
@@ -434,7 +439,6 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
				 #ifdef HAVE_AYUDAME_H
			
 
				 	if (AYU_event) AYU_event(AYU_INIT, 0, NULL);
			
 
				 #endif
			
 
				-
			
 
				 	for (worker = 0; worker < nworkers; worker++)
			
 
				 	{
			
 
				 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
			
@@ -1418,13 +1422,15 @@ unsigned starpu_worker_is_combined_worker(int id)
 
				 
			
 
				 struct _starpu_sched_ctx *_starpu_get_sched_ctx_struct(unsigned id)
			
 
				 {
			
 
				-	if (id == STARPU_NMAX_SCHED_CTXS) return NULL;
			
 
				+	if(id == STARPU_NMAX_SCHED_CTXS) return NULL;
			
 
				 	return &config.sched_ctxs[id];
			
 
				 }
			
 
				 
			
 
				 struct _starpu_combined_worker *_starpu_get_combined_worker_struct(unsigned id)
			
 
				 {
			
 
				 	unsigned basic_worker_count = starpu_worker_get_count();
			
 
				+	
			
 
				+	//_STARPU_DEBUG("basic_worker_count:%d\n",basic_worker_count);
			
 
				 
			
 
				 	STARPU_ASSERT(id >= basic_worker_count);
			
 
				 	return &config.combined_workers[id - basic_worker_count];
			
--- a/src/core/workers.h
+++ b/src/core/workers.h
@@ -123,6 +123,10 @@ struct _starpu_combined_worker
 
				 	int worker_size;
			
 
				 	unsigned memory_node; /* which memory node is associated that worker to ? */
			
 
				 	int combined_workerid[STARPU_NMAXWORKERS];
			
 
				+#ifdef STARPU_USE_MIC 
			
 
				+	int count;
			
 
				+	pthread_mutex_t count_mutex;
			
 
				+#endif
			
 
				 
			
 
				 #ifdef __GLIBC__
			
 
				 	cpu_set_t cpu_set;
			
--- a/src/datawizard/data_request.c
+++ b/src/datawizard/data_request.c
@@ -358,6 +358,7 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 
				 	/* perform the transfer */
			
 
				 	/* the header of the data must be locked by the worker that submitted the request */
			
 
				 
			
 
				+
			
 
				 	r->retval = _starpu_driver_copy_data_1_to_1(handle, src_replicate,
			
 
				 						    dst_replicate, !(r_mode & STARPU_R), r, may_alloc);
			
 
				 
			
@@ -387,7 +388,7 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 
				 	/* the request has been handled */
			
 
				 	_starpu_spin_lock(&r->lock);
			
 
				 	starpu_handle_data_request_completion(r);
			
 
				-
			
 
				+_STARPU_DEBUG("MEMORY!\n");
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
--- a/src/datawizard/filters.c
+++ b/src/datawizard/filters.c
@@ -258,9 +258,7 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 
				 		void *ptr;
			
 
				 		ptr = starpu_data_handle_to_pointer(child, 0);
			
 
				 		if (ptr != NULL)
			
 
				-		{
			
 
				 			_starpu_data_register_ram_pointer(child, ptr);
			
 
				-		}
			
 
				 	}
			
 
				 	/* now let the header */
			
 
				 	_starpu_spin_unlock(&initial_handle->header_lock);
			
--- a/src/datawizard/reduction.c
+++ b/src/datawizard/reduction.c
@@ -86,18 +86,16 @@ void _starpu_redux_init_data_replicate(starpu_data_handle_t handle, struct _star
 
				 #ifdef STARPU_USE_MIC
			
 
				 	if (starpu_worker_get_type(workerid) == STARPU_MIC_WORKER)
			
 
				 	{
			
 
				-		const struct _starpu_mp_node *node = _starpu_mic_src_get_actual_thread_mp_node();
			
 
				-		enum _starpu_mp_command answer;
			
 
				-		void *arg = NULL;
			
 
				-		int arg_size = 0;
			
 
				-
			
 
				-		// XXX: give the correct coreid.
			
 
				+		struct _starpu_mp_node *node = _starpu_mic_src_get_actual_thread_mp_node();
			
 
				+		int devid = _starpu_get_worker_struct(workerid)->devid;
			
 
				+		void * arg;
			
 
				+		int arg_size;
			
 
				 		_starpu_src_common_execute_kernel(node,
			
 
				-						  (void(*)(void))init_func, 0,
			
 
				-						  &handle, &(replicate->data_interface), 1,
			
 
				-						  NULL, 0);
			
 
				-		answer = _starpu_mp_common_recv_command (node, &arg, &arg_size);
			
 
				-		STARPU_ASSERT (answer == STARPU_EXECUTION_COMPLETED);
			
 
				+						 (void(*)(void))init_func, devid,
			
 
				+						 STARPU_SEQ, 0, 0, &handle, 
			
 
				+						 &(replicate->data_interface), 1,
			
 
				+						 NULL, 0);
			
 
				+		_starpu_src_common_wait_completed_execution(node,devid,&arg,&arg_size);
			
 
				 	}
			
 
				 	else
			
 
				 #endif
			
--- a/src/drivers/driver_common/driver_common.c
+++ b/src/drivers/driver_common/driver_common.c
@@ -95,7 +95,7 @@ void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j,
 
				 	}
			
 
				 
			
 
				 	if (starpu_top)
			
 
				-	  _starpu_top_task_ended(task,workerid,codelet_end);
			
 
				+		_starpu_top_task_ended(task,workerid,codelet_end);
			
 
				 
			
 
				 	args->status = STATUS_UNKNOWN;
			
 
				 }
			
@@ -129,9 +129,9 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 
				 			profiling_info->workerid = workerid;
			
 
				 
			
 
				 			_starpu_worker_update_profiling_info_executing(workerid, &measured_ts, 1,
			
 
				-				profiling_info->used_cycles,
			
 
				-				profiling_info->stall_cycles,
			
 
				-				profiling_info->power_consumed);
			
 
				+								       profiling_info->used_cycles,
			
 
				+								       profiling_info->stall_cycles,
			
 
				+								       profiling_info->power_consumed);
			
 
				 			updated =  1;
			
 
				 		}
			
 
				 
			
@@ -150,6 +150,29 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 
				 	}
			
 
				 }
			
 
				 
			
 
				+
			
 
				+
			
 
				+static void _starpu_worker_set_status_sleeping(int workerid)
			
 
				+{
			
 
				+	if (_starpu_worker_get_status(workerid) != STATUS_SLEEPING)
			
 
				+	{
			
 
				+		_STARPU_TRACE_WORKER_SLEEP_START;
			
 
				+		_starpu_worker_restart_sleeping(workerid);
			
 
				+		_starpu_worker_set_status(workerid, STATUS_SLEEPING);
			
 
				+	}
			
 
				+
			
 
				+}
			
 
				+
			
 
				+static void _starpu_worker_set_status_wakeup(int workerid)
			
 
				+{
			
 
				+	if (_starpu_worker_get_status(workerid) == STATUS_SLEEPING)
			
 
				+	{
			
 
				+		_STARPU_TRACE_WORKER_SLEEP_END;
			
 
				+		_starpu_worker_stop_sleeping(workerid);
			
 
				+		_starpu_worker_set_status(workerid, STATUS_UNKNOWN);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 /* Workers may block when there is no work to do at all. */
			
 
				 struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int workerid, unsigned memnode)
			
 
				 {
			
@@ -175,12 +198,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 
				 		 * driver may go block just after the scheduler got a new task to be
			
 
				 		 * executed, and thus hanging. */
			
 
				 
			
 
				-		if (_starpu_worker_get_status(workerid) != STATUS_SLEEPING)
			
 
				-		{
			
 
				-			_STARPU_TRACE_WORKER_SLEEP_START;
			
 
				-			_starpu_worker_restart_sleeping(workerid);
			
 
				-			_starpu_worker_set_status(workerid, STATUS_SLEEPING);
			
 
				-		}
			
 
				+		_starpu_worker_set_status_sleeping(workerid);
			
 
				 
			
 
				 		if (_starpu_worker_can_block(memnode))
			
 
				 			STARPU_PTHREAD_COND_WAIT(&args->sched_cond, &args->sched_mutex);
			
@@ -235,12 +253,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 
				 		perf_counters->notify_idle_end(task->sched_ctx, args->workerid);
			
 
				 #endif //STARPU_USE_SC_HYPERVISOR
			
 
				 
			
 
				-	if (_starpu_worker_get_status(workerid) == STATUS_SLEEPING)
			
 
				-	{
			
 
				-		_STARPU_TRACE_WORKER_SLEEP_END;
			
 
				-		_starpu_worker_stop_sleeping(workerid);
			
 
				-		_starpu_worker_set_status(workerid, STATUS_UNKNOWN);
			
 
				-	}
			
 
				+	_starpu_worker_set_status_wakeup(workerid);
			
 
				 
			
 
				 #ifdef HAVE_AYUDAME_H
			
 
				 	if (AYU_event)
			
@@ -252,3 +265,62 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 
				 
			
 
				 	return task;
			
 
				 }
			
 
				+
			
 
				+
			
 
				+int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_task ** tasks, int nworkers)
			
 
				+{
			
 
				+	int i, count = 0;
			
 
				+	struct _starpu_job * j;
			
 
				+	int is_parallel_task;
			
 
				+	struct _starpu_combined_worker *combined_worker;
			
 
				+	/*for each worker*/
			
 
				+	for (i = 0; i < nworkers; i++)
			
 
				+	{
			
 
				+		/*if the worker is already executinf a task then */
			
 
				+		if(workers[i].current_task)
			
 
				+		{
			
 
				+			tasks[i] = NULL;
			
 
				+		}
			
 
				+		/*else try to pop a task*/
			
 
				+		else
			
 
				+		{
			
 
				+			STARPU_PTHREAD_MUTEX_LOCK(&workers[i].sched_mutex);
			
 
				+			_starpu_set_local_worker_key(&workers[i]);
			
 
				+			tasks[i] = _starpu_pop_task(&workers[i]);
			
 
				+			STARPU_PTHREAD_MUTEX_UNLOCK(&workers[i].sched_mutex);
			
 
				+			if(tasks[i] != NULL)
			
 
				+			{
			
 
				+				count ++;
			
 
				+				j = _starpu_get_job_associated_to_task(tasks[i]);
			
 
				+				is_parallel_task = (j->task_size > 1);
			
 
				+				workers[i].current_task = j->task;
			
 
				+				/* Get the rank in case it is a parallel task */
			
 
				+				if (is_parallel_task)
			
 
				+				{
			
 
				+
			
 
				+					STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
			
 
				+					workers[i].current_rank = j->active_task_alias_count++;
			
 
				+					STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
			
 
				+					
			
 
				+					combined_worker = _starpu_get_combined_worker_struct(j->combined_workerid);
			
 
				+					workers[i].combined_workerid = j->combined_workerid;
			
 
				+					workers[i].worker_size = combined_worker->worker_size;
			
 
				+				}
			
 
				+				else
			
 
				+				{
			
 
				+					workers[i].combined_workerid = workers[i].workerid;
			
 
				+					workers[i].worker_size = 1;
			
 
				+					workers[i].current_rank = 0;
			
 
				+				}
			
 
				+
			
 
				+				_starpu_worker_set_status_wakeup(workers[i].workerid);
			
 
				+			}
			
 
				+			else
			
 
				+			{
			
 
				+				_starpu_worker_set_status_sleeping(workers[i].workerid);
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+	return count;
			
 
				+}
			
 
				+
			
--- a/src/drivers/driver_common/driver_common.h
+++ b/src/drivers/driver_common/driver_common.h
@@ -32,5 +32,5 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 
				 					struct timespec *codelet_start, struct timespec *codelet_end, int profiling);
			
 
				 
			
 
				 struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int workerid, unsigned memnode);
			
 
				-
			
 
				+int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_task ** tasks, int nworker);
			
 
				 #endif // __DRIVER_COMMON_H__
			
--- a/src/drivers/mic/driver_mic_common.c
+++ b/src/drivers/mic/driver_mic_common.c
@@ -19,7 +19,6 @@
 
				 #include <drivers/mp_common/mp_common.h>
			
 
				 #include <drivers/mic/driver_mic_common.h>
			
 
				 
			
 
				-
			
 
				 void _starpu_mic_common_report_scif_error(const char *func, const char *file, const int line, const int status)
			
 
				 {
			
 
				 	const char *errormsg = strerror(status);
			
@@ -33,10 +32,25 @@ void _starpu_mic_common_report_scif_error(const char *func, const char *file, co
 
				 
			
 
				 void _starpu_mic_common_send(const struct _starpu_mp_node *node, void *msg, int len)
			
 
				 {
			
 
				-	if ((scif_send(node->mp_connection.mic_endpoint, msg, len, SCIF_SEND_BLOCK)) < 0)
			
 
				+  if ((scif_send(node->mp_connection.mic_endpoint, msg, len, SCIF_SEND_BLOCK)) < 0)
			
 
				 		STARPU_MP_COMMON_REPORT_ERROR(node, errno);
			
 
				 }
			
 
				 
			
 
				+
			
 
				+/* Teel is the mic endpoint is ready
			
 
				+ * return 1 if a message has been receive, 0 if no message has been receive
			
 
				+ */
			
 
				+int _starpu_mic_common_recv_is_ready(const struct _starpu_mp_node *mp_node)
			
 
				+{
			
 
				+  struct scif_pollepd pollepd;
			
 
				+  pollepd.epd = mp_node->mp_connection.mic_endpoint;
			
 
				+  pollepd.events = SCIF_POLLIN;
			
 
				+  pollepd.revents = 0;
			
 
				+  return  scif_poll(&pollepd,1,0);
			
 
				+	
			
 
				+}
			
 
				+
			
 
				+
			
 
				 /* Handles the error so the caller (which must be generic) doesn't have to
			
 
				  * care about it.
			
 
				  */
			
@@ -49,7 +63,7 @@ void _starpu_mic_common_recv(const struct _starpu_mp_node *node, void *msg, int
 
				 
			
 
				 /* Handles the error so the caller (which must be generic) doesn't have to
			
 
				  * care about it.
			
 
				- */
			
 
				+x */
			
 
				 void _starpu_mic_common_dt_send(const struct _starpu_mp_node *mp_node, void *msg, int len)
			
 
				 {
			
 
				 	if ((scif_send(mp_node->host_sink_dt_connection.mic_endpoint, msg, len, SCIF_SEND_BLOCK)) < 0)
			
@@ -114,7 +128,7 @@ void _starpu_mic_common_accept(scif_epd_t *endpoint, uint16_t port_number)
 
				 	_STARPU_DEBUG("MIC accepting connection on %u...\n", port_number);
			
 
				 	if ((scif_accept(init_epd, &portID, endpoint, SCIF_ACCEPT_SYNC)) < 0)
			
 
				 		STARPU_MIC_COMMON_REPORT_SCIF_ERROR(errno);
			
 
				-	_STARPU_DEBUG("done\n", init_epd);
			
 
				+	_STARPU_DEBUG("done : %d\n", init_epd);
			
 
				 
			
 
				 	scif_close(init_epd);
			
 
				 }
			
--- a/src/drivers/mic/driver_mic_common.h
+++ b/src/drivers/mic/driver_mic_common.h
@@ -53,6 +53,8 @@ struct _starpu_mic_free_command
 
				 
			
 
				 void _starpu_mic_common_report_scif_error(const char *func, const char *file, int line, const int status);
			
 
				 
			
 
				+int _starpu_mic_common_recv_is_ready(const struct _starpu_mp_node *mp_node);
			
 
				+
			
 
				 void _starpu_mic_common_send(const struct _starpu_mp_node *node, void *msg, int len);
			
 
				 
			
 
				 void _starpu_mic_common_recv(const struct _starpu_mp_node *node, void *msg, int len);
			
--- a/src/drivers/mic/driver_mic_sink.c
+++ b/src/drivers/mic/driver_mic_sink.c
@@ -16,12 +16,14 @@
 
				 
			
 
				 
			
 
				 #include <errno.h>
			
 
				+#include <dlfcn.h>
			
 
				 
			
 
				 #include <common/COISysInfo_common.h>
			
 
				 
			
 
				 #include <starpu.h>
			
 
				 #include <drivers/mp_common/mp_common.h>
			
 
				 #include <drivers/mp_common/sink_common.h>
			
 
				+#include <datawizard/interfaces/data_interface.h>
			
 
				 
			
 
				 #include "driver_mic_common.h"
			
 
				 #include "driver_mic_sink.h"
			
@@ -29,17 +31,27 @@
 
				 /* Initialize the MIC sink, initializing connection to the source
			
 
				  * and to the other devices (not implemented yet).
			
 
				  */
			
 
				-
			
 
				 void _starpu_mic_sink_init(struct _starpu_mp_node *node)
			
 
				 {
			
 
				-	//unsigned int i;
			
 
				-	
			
 
				+	pthread_t self;
			
 
				+	cpu_set_t cpuset;
			
 
				+
			
 
				+	/*Bind on the first core*/
			
 
				+	self = pthread_self();
			
 
				+	CPU_ZERO(&cpuset);
			
 
				+	CPU_SET(241,&cpuset);
			
 
				+	pthread_setaffinity_np(self,sizeof(cpu_set_t),&cpuset);
			
 
				+
			
 
				+
			
 
				 	/* Initialize connection with the source */
			
 
				 	_starpu_mic_common_accept(&node->mp_connection.mic_endpoint,
			
 
				 					 STARPU_MIC_SOURCE_PORT_NUMBER);
			
 
				 
			
 
				 	_starpu_mic_common_accept(&node->host_sink_dt_connection.mic_endpoint,
			
 
				 									 STARPU_MIC_SOURCE_DT_PORT_NUMBER);
			
 
				+	
			
 
				+	node->nb_cores = COISysGetHardwareThreadCount() - COISysGetHardwareThreadCount() / COISysGetCoreCount();
			
 
				+	node->thread_table = malloc(sizeof(pthread_t)*node->nb_cores);
			
 
				 
			
 
				 	//node->sink_sink_dt_connections = malloc(node->nb_mp_sinks * sizeof(union _starpu_mp_connection));
			
 
				 
			
@@ -54,11 +66,58 @@ void _starpu_mic_sink_init(struct _starpu_mp_node *node)
 
				 	//								STARPU_MIC_SINK_SINK_DT_PORT_NUMBER(node->devid, i));
			
 
				 }
			
 
				 
			
 
				-/* Deinitialize the MIC sink, close all the connections.
			
 
				+/* Launch all workers on the mic
			
 
				  */
			
 
				+void _starpu_mic_sink_launch_workers(struct _starpu_mp_node *node)
			
 
				+{
			
 
				+	int i, ret;
			
 
				+	struct arg_sink_thread * arg;
			
 
				+	cpu_set_t cpuset;
			
 
				+	pthread_attr_t attr;
			
 
				+	pthread_t thread;
			
 
				+	
			
 
				+	/*for each core init the mutex, the task pointer and launch the thread */
			
 
				+	for(i=0; i<node->nb_cores; i++)
			
 
				+	{
			
 
				+		//init the set
			
 
				+		CPU_ZERO(&cpuset);
			
 
				+		CPU_SET(i,&cpuset);
			
 
				+
			
 
				+		ret = pthread_attr_init(&attr);
			
 
				+		STARPU_ASSERT(ret == 0);
			
 
				+		ret = pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset);
			
 
				+		STARPU_ASSERT(ret == 0);
			
 
				+
			
 
				+		/*prepare the argument for the thread*/
			
 
				+		arg= malloc(sizeof(struct arg_sink_thread));
			
 
				+		arg->coreid = i;
			
 
				+		arg->node = node;
			
 
				+		
			
 
				+		ret = pthread_create(&thread, &attr, _starpu_sink_thread, arg);
			
 
				+		STARPU_ASSERT(ret == 0);
			
 
				+		((pthread_t *)node->thread_table)[i] = thread;
			
 
				+	}
			
 
				+
			
 
				+}
			
 
				 
			
 
				+/* Deinitialize the MIC sink, close all the connections.
			
 
				+ */
			
 
				 void _starpu_mic_sink_deinit(struct _starpu_mp_node *node)
			
 
				 {
			
 
				+	
			
 
				+	int i;
			
 
				+	node->is_running = 0;
			
 
				+	for(i=0; i<node->nb_cores; i++)
			
 
				+	{
			
 
				+		sem_post(&node->sem_run_table[i]);
			
 
				+		pthread_join(((pthread_t *)node->thread_table)[i],NULL);
			
 
				+	}
			
 
				+
			
 
				+	free(node->thread_table);
			
 
				+
			
 
				+	scif_close(node->host_sink_dt_connection.mic_endpoint);
			
 
				+	scif_close(node->mp_connection.mic_endpoint);
			
 
				+
			
 
				 	//unsigned int i;
			
 
				 
			
 
				 	//for (i = 0; i < node->nb_mp_sinks; ++i)
			
@@ -69,14 +128,11 @@ void _starpu_mic_sink_deinit(struct _starpu_mp_node *node)
 
				 
			
 
				 	//free(node->sink_sink_dt_connections);
			
 
				 
			
 
				-	scif_close(node->host_sink_dt_connection.mic_endpoint);
			
 
				-	scif_close(node->mp_connection.mic_endpoint);
			
 
				 }
			
 
				 
			
 
				 /* Report an error which occured when using a MIC device
			
 
				  * and print this error in a human-readable style
			
 
				  */
			
 
				-
			
 
				 void _starpu_mic_sink_report_error(const char *func, const char *file, const int line, const int status)
			
 
				 {
			
 
				 	const char *errormsg = strerror(status);
			
@@ -84,13 +140,6 @@ void _starpu_mic_sink_report_error(const char *func, const char *file, const int
 
				 	STARPU_ASSERT(0);
			
 
				 }
			
 
				 
			
 
				-/* Return the number of cores on the callee, a MIC device or Processor Xeon
			
 
				- */
			
 
				-unsigned int _starpu_mic_sink_get_nb_core(void)
			
 
				-{
			
 
				-	return (unsigned int) COISysGetCoreCount();
			
 
				-}
			
 
				-
			
 
				 /* Allocate memory on the MIC.
			
 
				  * Memory is register for remote direct access. */
			
 
				 void _starpu_mic_sink_allocate(const struct _starpu_mp_node *mp_node, void *arg, int arg_size)
			
@@ -133,3 +182,28 @@ void _starpu_mic_sink_free(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUT
 
				 #endif
			
 
				 	free(addr);
			
 
				 }
			
 
				+
			
 
				+
			
 
				+/* bind the thread to a core
			
 
				+ */
			
 
				+void _starpu_mic_sink_bind_thread(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, int coreid, int * core_table, int nb_core)
			
 
				+{
			
 
				+	cpu_set_t cpuset;
			
 
				+	int i;
			
 
				+
			
 
				+  	//init the set
			
 
				+	CPU_ZERO(&cpuset);
			
 
				+
			
 
				+	//adding the core to the set
			
 
				+	for(i=0;i<nb_core;i++)
			
 
				+		CPU_SET(core_table[i],&cpuset);
			
 
				+
			
 
				+	pthread_setaffinity_np(((pthread_t*)mp_node->thread_table)[coreid],sizeof(cpu_set_t),&cpuset);
			
 
				+}
			
 
				+
			
 
				+void (*_starpu_mic_sink_lookup (const struct _starpu_mp_node * node STARPU_ATTRIBUTE_UNUSED, char* func_name))(void)
			
 
				+{
			
 
				+	void *dl_handle = dlopen(NULL, RTLD_NOW);
			
 
				+	return dlsym(dl_handle, func_name);
			
 
				+}
			
 
				+
			
--- a/src/drivers/mic/driver_mic_sink.h
+++ b/src/drivers/mic/driver_mic_sink.h
@@ -34,13 +34,15 @@
 
				 void _starpu_mic_sink_report_error(const char *func, const char *file, const int line, const int status);
			
 
				 
			
 
				 void _starpu_mic_sink_init(struct _starpu_mp_node *node);
			
 
				-
			
 
				+void _starpu_mic_sink_launch_workers(struct _starpu_mp_node *node);
			
 
				 void _starpu_mic_sink_deinit(struct _starpu_mp_node *node);
			
 
				 
			
 
				-unsigned int _starpu_mic_sink_get_nb_core(void);
			
 
				-
			
 
				 void _starpu_mic_sink_allocate(const struct _starpu_mp_node *mp_node, void *arg, int arg_size);
			
 
				 void _starpu_mic_sink_free(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, void *arg, int arg_size);
			
 
				+void _starpu_mic_sink_bind_thread(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, int coreid, int * core_table, int nb_core);
			
 
				+
			
 
				+void (*_starpu_mic_sink_lookup (const struct _starpu_mp_node * node STARPU_ATTRIBUTE_UNUSED,
			
 
				+			char* func_name))(void);
			
 
				 
			
 
				 #endif /* STARPU_USE_MIC */
			
 
				 
			
--- a/src/drivers/mic/driver_mic_source.c
+++ b/src/drivers/mic/driver_mic_source.c
@@ -73,7 +73,7 @@ starpu_pthread_mutex_t nb_mic_worker_init_mutex = PTHREAD_MUTEX_INITIALIZER;
 
				 //	return config->workers[workerid].devid;
			
 
				 //}
			
 
				 
			
 
				-const struct _starpu_mp_node *_starpu_mic_src_get_actual_thread_mp_node()
			
 
				+struct _starpu_mp_node *_starpu_mic_src_get_actual_thread_mp_node()
			
 
				 {
			
 
				 	struct _starpu_worker *actual_worker = _starpu_get_local_worker_key();
			
 
				 	STARPU_ASSERT(actual_worker);
			
@@ -110,96 +110,6 @@ void _starpu_mic_clear_kernels(void)
 
				 	}
			
 
				 }
			
 
				 
			
 
				-static int
			
 
				-_starpu_mic_src_finalize_job (struct _starpu_job *j, struct _starpu_worker *worker)
			
 
				-{
			
 
				-	uint32_t mask = 0;
			
 
				-	int profiling = starpu_profiling_status_get();
			
 
				-	struct timespec codelet_end;
			
 
				-
			
 
				-	_starpu_driver_end_job(worker, j, worker->perf_arch, &codelet_end, 0,
			
 
				-			       profiling);
			
 
				-
			
 
				-	_starpu_driver_update_job_feedback(j, worker, worker->perf_arch,
			
 
				-					   &j->cl_start, &codelet_end,
			
 
				-					   profiling);
			
 
				-
			
 
				-	_starpu_push_task_output (j, mask);
			
 
				-
			
 
				-	_starpu_handle_job_termination(j);
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-static int
			
 
				-_starpu_mic_src_process_completed_job (struct _starpu_worker_set *workerset)
			
 
				-{
			
 
				-	struct _starpu_mp_node *node = mic_nodes[workerset->workers[0].mp_nodeid];
			
 
				-	enum _starpu_mp_command answer;
			
 
				-	void *arg;
			
 
				-	int arg_size;
			
 
				-
			
 
				-	answer = _starpu_mp_common_recv_command (node, &arg, &arg_size);
			
 
				-	STARPU_ASSERT (answer == STARPU_EXECUTION_COMPLETED);
			
 
				-
			
 
				-	void *arg_ptr = arg;
			
 
				-	int coreid;
			
 
				-
			
 
				-	coreid = *(int *) arg_ptr;
			
 
				-	arg_ptr += sizeof (coreid); // Useless.
			
 
				-
			
 
				-	struct _starpu_worker *worker = &workerset->workers[coreid];
			
 
				-	struct starpu_task *task = worker->current_task;
			
 
				-	struct _starpu_job *j = _starpu_get_job_associated_to_task (task);
			
 
				-
			
 
				-	_starpu_mic_src_finalize_job (j, worker);
			
 
				-
			
 
				-	worker->current_task = NULL;
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-
			
 
				-static int _starpu_mic_src_execute_job(struct _starpu_job *j, struct _starpu_worker *args)
			
 
				-{
			
 
				-	int ret;
			
 
				-	uint32_t mask = 0;
			
 
				-
			
 
				-	STARPU_ASSERT(j);
			
 
				-	struct starpu_task *task = j->task;
			
 
				-
			
 
				-	//struct timespec codelet_end;
			
 
				-
			
 
				-	int profiling = starpu_profiling_status_get();
			
 
				-	unsigned calibrate_model = 0;
			
 
				-
			
 
				-	STARPU_ASSERT(task);
			
 
				-	struct starpu_codelet *cl = task->cl;
			
 
				-	STARPU_ASSERT(cl);
			
 
				-
			
 
				-	if (cl->model && cl->model->benchmarking)
			
 
				-		calibrate_model = 1;
			
 
				-
			
 
				-	ret = _starpu_fetch_task_input(j, mask);
			
 
				-	if (ret != 0)
			
 
				-	{
			
 
				-		/* there was not enough memory, so the input of
			
 
				-		 * the codelet cannot be fetched ... put the
			
 
				-		 * codelet back, and try it later */
			
 
				-		return -EAGAIN;
			
 
				-	}
			
 
				-
			
 
				-
			
 
				-	starpu_mic_kernel_t kernel = _starpu_mic_src_get_kernel_from_codelet(j->task->cl, j->nimpl);
			
 
				-
			
 
				-	_starpu_driver_start_job (args, j, &j->cl_start, 0, profiling);
			
 
				-
			
 
				-	_starpu_src_common_execute_kernel_from_task(mic_nodes[args->mp_nodeid],
			
 
				-						    (void (*)(void)) kernel, args->devid, task);
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				 int _starpu_mic_src_register_kernel(starpu_mic_func_symbol_t *symbol, const char *func_name)
			
 
				 {
			
 
				 	unsigned int func_name_size = (strlen(func_name) + 1) * sizeof(char);
			
@@ -248,9 +158,11 @@ int _starpu_mic_src_register_kernel(starpu_mic_func_symbol_t *symbol, const char
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+
			
 
				 starpu_mic_kernel_t _starpu_mic_src_get_kernel(starpu_mic_func_symbol_t symbol)
			
 
				 {
			
 
				 	int workerid = starpu_worker_get_id();
			
 
				+	
			
 
				 	/* This function has to be called in the codelet only, by the thread
			
 
				 	 * which will handle the task */
			
 
				 	if (workerid < 0)
			
@@ -365,6 +277,43 @@ starpu_mic_kernel_t _starpu_mic_src_get_kernel_from_codelet(struct starpu_codele
 
				 	return kernel;
			
 
				 }
			
 
				 
			
 
				+
			
 
				+
			
 
				+void(* _starpu_mic_src_get_kernel_from_job(const struct _starpu_mp_node *node STARPU_ATTRIBUTE_UNUSED, struct _starpu_job *j))(void)
			
 
				+{
			
 
				+	starpu_mic_kernel_t kernel = NULL;
			
 
				+
			
 
				+	starpu_mic_func_t func = _starpu_task_get_mic_nth_implementation(j->task->cl, j->nimpl);
			
 
				+	if (func)
			
 
				+	{
			
 
				+		/* We execute the function contained in the codelet, it must return a
			
 
				+		 * pointer to the function to execute on the device, either specified
			
 
				+		 * directly by the user or by a call to starpu_mic_get_func().
			
 
				+		 */
			
 
				+		kernel = func();
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		/* If user dont define any starpu_mic_fun_t in cl->mic_func we try to use
			
 
				+		 * cpu_func_name.
			
 
				+		 */
			
 
				+		char *func_name = _starpu_task_get_cpu_name_nth_implementation(j->task->cl, j->nimpl);
			
 
				+		if (func_name)
			
 
				+		{
			
 
				+			starpu_mic_func_symbol_t symbol;
			
 
				+
			
 
				+			_starpu_mic_src_register_kernel(&symbol, func_name);
			
 
				+
			
 
				+			kernel = _starpu_mic_src_get_kernel(symbol);
			
 
				+		}
			
 
				+	}
			
 
				+	STARPU_ASSERT(kernel);
			
 
				+
			
 
				+	return (void (*)(void))kernel;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+
			
 
				 /* Initialize the node structure describing the MIC source.
			
 
				  */
			
 
				 void _starpu_mic_src_init(struct _starpu_mp_node *node)
			
@@ -553,18 +502,20 @@ int _starpu_mic_request_is_complete(struct _starpu_mic_async_event *event)
 
				 	return 1;
			
 
				 }
			
 
				 
			
 
				+
			
 
				+
			
 
				 void *_starpu_mic_src_worker(void *arg)
			
 
				 {
			
 
				-	struct _starpu_worker_set *args = arg;
			
 
				+	struct _starpu_worker_set *worker_set = arg;
			
 
				 	/* As all workers of a set share common data, we just use the first
			
 
				 	 * one for intializing the following stuffs. */
			
 
				-	struct _starpu_worker *baseworker = &args->workers[0];
			
 
				+	struct _starpu_worker *baseworker = &worker_set->workers[0];
			
 
				 	struct _starpu_machine_config *config = baseworker->config;
			
 
				 	unsigned baseworkerid = baseworker - config->workers;
			
 
				 	unsigned mp_nodeid = baseworker->mp_nodeid;
			
 
				 	unsigned i;
			
 
				 
			
 
				-	unsigned memnode = baseworker->memory_node;
			
 
				+	/* unsigned memnode = baseworker->memory_node; */
			
 
				 
			
 
				 	_starpu_worker_init(baseworker, _STARPU_FUT_MIC_KEY);
			
 
				 
			
@@ -582,131 +533,15 @@ void *_starpu_mic_src_worker(void *arg)
 
				 	_STARPU_TRACE_WORKER_INIT_END;
			
 
				 
			
 
				 	/* tell the main thread that this one is ready */
			
 
				-	STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);
			
 
				-	args->set_is_initialized = 1;
			
 
				-	STARPU_PTHREAD_COND_SIGNAL(&args->ready_cond);
			
 
				-	STARPU_PTHREAD_MUTEX_UNLOCK(&args->mutex);
			
 
				-
			
 
				+	STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);
			
 
				+	worker_set->set_is_initialized = 1;
			
 
				+	STARPU_PTHREAD_COND_SIGNAL(&worker_set->ready_cond);
			
 
				+	STARPU_PTHREAD_MUTEX_UNLOCK(&worker_set->mutex);
			
 
				 
			
 
				-	while (_starpu_machine_is_running())
			
 
				-	{
			
 
				-		int res;
			
 
				-		struct starpu_task *task = NULL;
			
 
				-		struct _starpu_job * j;
			
 
				-		unsigned micworkerid = 0;
			
 
				-
			
 
				-		_STARPU_TRACE_START_PROGRESS(memnode);
			
 
				-		_starpu_datawizard_progress(memnode, 1);
			
 
				-		_STARPU_TRACE_END_PROGRESS(memnode);
			
 
				-
			
 
				-		STARPU_PTHREAD_MUTEX_LOCK(&baseworker->sched_mutex);
			
 
				-
			
 
				-		/* We pop tasklists of each worker in the set and process the
			
 
				-		 * first non-empty list. */
			
 
				-		for (micworkerid = 0 ; (micworkerid < args->nworkers) && (task == NULL); micworkerid++)
			
 
				-		    task = _starpu_pop_task (&args->workers[micworkerid]);
			
 
				-
			
 
				-		if (task != NULL) {
			
 
				-			micworkerid--;
			
 
				-			goto task_found;
			
 
				-		}
			
 
				-
			
 
				-#if 0 // XXX: synchronous execution for now
			
 
				-		/* No task to submit, so we can poll the MIC device for
			
 
				-		 * completed jobs. */
			
 
				-		struct pollfd fd = {
			
 
				-		    .fd = mic_nodes[baseworker->mp_nodeid]->mp_connection.mic_endpoint,
			
 
				-		    .events = POLLIN
			
 
				-		};
			
 
				-
			
 
				-		if (0 < poll (&fd, 1, 0)) {
			
 
				-		    _starpu_mic_src_process_completed_job (args);
			
 
				-		    goto restart_loop;
			
 
				-		}
			
 
				-#endif
			
 
				-
			
 
				-		/* At this point, there is really nothing to do for the thread
			
 
				-		 * so we can block.
			
 
				-		 * XXX: blocking drivers is in fact broken. DO NOT USE IT ! */
			
 
				-		if (_starpu_worker_get_status(baseworkerid) != STATUS_SLEEPING)
			
 
				-		{
			
 
				-			_STARPU_TRACE_WORKER_SLEEP_START;
			
 
				-			_starpu_worker_restart_sleeping(baseworkerid);
			
 
				-			_starpu_worker_set_status(baseworkerid, STATUS_SLEEPING);
			
 
				-		}
			
 
				-
			
 
				-		if (_starpu_worker_can_block(memnode))
			
 
				-			STARPU_PTHREAD_COND_WAIT(&baseworker->sched_cond, &baseworker->sched_mutex);
			
 
				-		else
			
 
				-		{
			
 
				-			if (_starpu_machine_is_running())
			
 
				-				STARPU_UYIELD();
			
 
				-		}
			
 
				-
			
 
				-		if (_starpu_worker_get_status(baseworkerid) == STATUS_SLEEPING)
			
 
				-		{
			
 
				-			_STARPU_TRACE_WORKER_SLEEP_END;
			
 
				-			_starpu_worker_stop_sleeping(baseworkerid);
			
 
				-			_starpu_worker_set_status(baseworkerid, STATUS_UNKNOWN);
			
 
				-		}
			
 
				-
			
 
				-	restart_loop:
			
 
				-		STARPU_PTHREAD_MUTEX_UNLOCK(&baseworker->sched_mutex);
			
 
				-		continue;
			
 
				-
			
 
				-	task_found:
			
 
				-		/* If the MIC core associated to `micworkerid' is already
			
 
				-		 * processing a job, we push back this one in the worker task
			
 
				-		 * list. */
			
 
				-		STARPU_PTHREAD_MUTEX_UNLOCK(&baseworker->sched_mutex);
			
 
				-
			
 
				-		if (args->workers[micworkerid].current_task) {
			
 
				-		    _starpu_push_task_to_workers(task);
			
 
				-		    continue;
			
 
				-		}
			
 
				-
			
 
				-		STARPU_ASSERT(task);
			
 
				-		j = _starpu_get_job_associated_to_task(task);
			
 
				-
			
 
				-		/* can a MIC device do that task ? */
			
 
				-		if (!_STARPU_MIC_MAY_PERFORM(j))
			
 
				-		{
			
 
				-			/* this isn't a mic task */
			
 
				-			_starpu_push_task_to_workers(task);
			
 
				-			continue;
			
 
				-		}
			
 
				-
			
 
				-		args->workers[micworkerid].current_task = j->task;
			
 
				-
			
 
				-		res = _starpu_mic_src_execute_job (j, &args->workers[micworkerid]);
			
 
				-
			
 
				-		if (res)
			
 
				-		{
			
 
				-			switch (res)
			
 
				-			{
			
 
				-				case -EAGAIN:
			
 
				-					_STARPU_DISP("ouch, Xeon Phi could not actually run task %p, putting it back...\n", task);
			
 
				-					_starpu_push_task_to_workers(task);
			
 
				-					STARPU_ABORT();
			
 
				-					continue;
			
 
				-				default:
			
 
				-					STARPU_ASSERT(0);
			
 
				-			}
			
 
				-		}
			
 
				-
			
 
				-		/* XXX: synchronous execution for now */
			
 
				-		_starpu_mic_src_process_completed_job (args);
			
 
				-	}
			
 
				+	_starpu_src_common_worker(worker_set, baseworkerid, mic_nodes[mp_nodeid]);
			
 
				 
			
 
				 	_STARPU_TRACE_WORKER_DEINIT_START;
			
 
				 
			
 
				-	_starpu_handle_all_pending_node_data_requests(memnode);
			
 
				-
			
 
				-	/* In case there remains some memory that was automatically
			
 
				-	 * allocated by StarPU, we release it now. Note that data
			
 
				-	 * coherency is not maintained anymore at that point ! */
			
 
				-	_starpu_free_all_automatically_allocated_buffers(memnode);
			
 
				-
			
 
				 	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_CUDA_KEY);
			
 
				 
			
 
				 	return NULL;
			
--- a/src/drivers/mic/driver_mic_source.h
+++ b/src/drivers/mic/driver_mic_source.h
@@ -24,6 +24,7 @@
 
				 
			
 
				 #include <source/COIProcess_source.h>
			
 
				 #include <source/COIEngine_source.h>
			
 
				+#include <core/workers.h>
			
 
				 
			
 
				 #include <drivers/mp_common/mp_common.h>
			
 
				 
			
@@ -41,9 +42,10 @@ struct _starpu_mic_async_event *event;
 
				 #define STARPU_MIC_SRC_REPORT_SCIF_ERROR(status) \
			
 
				 	_starpu_mic_src_report_scif_error(__starpu_func__, __FILE__, __LINE__, status)
			
 
				 
			
 
				-const struct _starpu_mp_node *_starpu_mic_src_get_actual_thread_mp_node();
			
 
				+struct _starpu_mp_node *_starpu_mic_src_get_actual_thread_mp_node();
			
 
				 const struct _starpu_mp_node *_starpu_mic_src_get_mp_node_from_memory_node(int memory_node);
			
 
				 
			
 
				+void(* _starpu_mic_src_get_kernel_from_job(const struct _starpu_mp_node *node STARPU_ATTRIBUTE_UNUSED, struct _starpu_job *j))(void);
			
 
				 int _starpu_mic_src_register_kernel(starpu_mic_func_symbol_t *symbol, const char *func_name);
			
 
				 starpu_mic_kernel_t _starpu_mic_src_get_kernel(starpu_mic_func_symbol_t symbol);
			
 
				 
			
--- a/src/drivers/mp_common/mp_common.c
+++ b/src/drivers/mp_common/mp_common.c
@@ -27,12 +27,14 @@
 
				 #include <drivers/scc/driver_scc_source.h>
			
 
				 #include <drivers/scc/driver_scc_sink.h>
			
 
				 
			
 
				+#include <common/list.h>
			
 
				+
			
 
				 /* Allocate and initialize the sink structure, when the function returns
			
 
				  * all the pointer of functions are linked to the right ones.
			
 
				  */
			
 
				 struct _starpu_mp_node * STARPU_ATTRIBUTE_MALLOC
			
 
				-    _starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind,
			
 
				-				  int peer_id)
			
 
				+_starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind,
			
 
				+			      int peer_id)
			
 
				 {
			
 
				 	struct _starpu_mp_node *node;
			
 
				 
			
@@ -45,111 +47,120 @@ struct _starpu_mp_node * STARPU_ATTRIBUTE_MALLOC
 
				 	switch(node->kind)
			
 
				 	{
			
 
				 #ifdef STARPU_USE_MIC
			
 
				-		case STARPU_MIC_SOURCE:
			
 
				-			{
			
 
				-				node->nb_mp_sinks = starpu_mic_worker_get_count();
			
 
				-				node->devid = peer_id;
			
 
				-
			
 
				-				node->init = _starpu_mic_src_init;
			
 
				-				node->deinit = _starpu_mic_src_deinit;
			
 
				-				node->report_error = _starpu_mic_src_report_scif_error;
			
 
				-
			
 
				-				node->mp_send = _starpu_mic_common_send;
			
 
				-				node->mp_recv = _starpu_mic_common_recv;
			
 
				-				node->dt_send = _starpu_mic_common_dt_send;
			
 
				-				node->dt_recv = _starpu_mic_common_dt_recv;
			
 
				-
			
 
				-				node->execute = NULL;
			
 
				-				node->nbcores = NULL;
			
 
				-				node->allocate = NULL;
			
 
				-				node->free = NULL;
			
 
				-
			
 
				-				/* A source node is only working on one core,
			
 
				-				 * there is no need for this function */
			
 
				-				node->get_nb_core = NULL;
			
 
				-			}
			
 
				-			break;
			
 
				-
			
 
				-		case STARPU_MIC_SINK:
			
 
				-			{
			
 
				-				node->devid = atoi(getenv("DEVID"));;
			
 
				-				node->nb_mp_sinks = atoi(getenv("NB_MIC"));
			
 
				-
			
 
				-				node->init = _starpu_mic_sink_init;
			
 
				-				node->deinit = _starpu_mic_sink_deinit;
			
 
				-				node->report_error = _starpu_mic_sink_report_error;
			
 
				-
			
 
				-				node->mp_send = _starpu_mic_common_send;
			
 
				-				node->mp_recv = _starpu_mic_common_recv;
			
 
				-				node->dt_send = _starpu_mic_common_dt_send;
			
 
				-				node->dt_recv = _starpu_mic_common_dt_recv;
			
 
				-
			
 
				-				node->execute = _starpu_sink_common_execute;
			
 
				-				node->nbcores = _starpu_sink_nbcores;
			
 
				-				node->allocate = _starpu_mic_sink_allocate;
			
 
				-				node->free = _starpu_mic_sink_free;
			
 
				-
			
 
				-				node->get_nb_core = _starpu_mic_sink_get_nb_core;
			
 
				-			}
			
 
				-			break;
			
 
				+	case STARPU_MIC_SOURCE:
			
 
				+	{
			
 
				+		node->nb_mp_sinks = starpu_mic_worker_get_count();
			
 
				+		node->devid = peer_id;
			
 
				+
			
 
				+		node->init = _starpu_mic_src_init;
			
 
				+		node->launch_workers= NULL;
			
 
				+		node->deinit = _starpu_mic_src_deinit;
			
 
				+		node->report_error = _starpu_mic_src_report_scif_error;
			
 
				+
			
 
				+		node->mp_recv_is_ready = _starpu_mic_common_recv_is_ready;
			
 
				+		node->mp_send = _starpu_mic_common_send;
			
 
				+		node->mp_recv = _starpu_mic_common_recv;
			
 
				+		node->dt_send = _starpu_mic_common_dt_send;
			
 
				+		node->dt_recv = _starpu_mic_common_dt_recv;
			
 
				+
			
 
				+		node->get_kernel_from_job =_starpu_mic_src_get_kernel_from_job;
			
 
				+		node->lookup = NULL;
			
 
				+		node->bind_thread = NULL;
			
 
				+		node->execute = NULL;
			
 
				+		node->allocate = NULL;
			
 
				+		node->free = NULL;
			
 
				+	}
			
 
				+	break;
			
 
				+
			
 
				+	case STARPU_MIC_SINK:
			
 
				+	{
			
 
				+		node->devid = atoi(getenv("DEVID"));;
			
 
				+		node->nb_mp_sinks = atoi(getenv("NB_MIC"));
			
 
				+
			
 
				+		node->init = _starpu_mic_sink_init;
			
 
				+		node->launch_workers = _starpu_mic_sink_launch_workers;
			
 
				+		node->deinit = _starpu_mic_sink_deinit;
			
 
				+		node->report_error = _starpu_mic_sink_report_error;
			
 
				+
			
 
				+		node->mp_recv_is_ready = _starpu_mic_common_recv_is_ready;
			
 
				+		node->mp_send = _starpu_mic_common_send;
			
 
				+		node->mp_recv = _starpu_mic_common_recv;
			
 
				+		node->dt_send = _starpu_mic_common_dt_send;
			
 
				+		node->dt_recv = _starpu_mic_common_dt_recv;
			
 
				+
			
 
				+		node->get_kernel_from_job = NULL;
			
 
				+		node->lookup = _starpu_mic_sink_lookup;
			
 
				+		node->bind_thread = _starpu_mic_sink_bind_thread;
			
 
				+		node->execute = _starpu_sink_common_execute;
			
 
				+		node->allocate = _starpu_mic_sink_allocate;
			
 
				+		node->free = _starpu_mic_sink_free;
			
 
				+
			
 
				+	}
			
 
				+	break;
			
 
				 #endif /* STARPU_USE_MIC */
			
 
				 
			
 
				 #ifdef STARPU_USE_SCC
			
 
				-		case STARPU_SCC_SOURCE:
			
 
				-			{
			
 
				-				node->init = _starpu_scc_src_init;
			
 
				-				node->deinit = NULL;
			
 
				-				node->report_error = _starpu_scc_common_report_rcce_error;
			
 
				-
			
 
				-				node->mp_send = _starpu_scc_common_send;
			
 
				-				node->mp_recv = _starpu_scc_common_recv;
			
 
				-				node->dt_send = _starpu_scc_common_send;
			
 
				-				node->dt_recv = _starpu_scc_common_recv;
			
 
				-				node->dt_send_to_device = NULL;
			
 
				-				node->dt_recv_from_device = NULL;
			
 
				-
			
 
				-				node->execute = NULL;
			
 
				-				node->allocate = NULL;
			
 
				-				node->free = NULL;
			
 
				-
			
 
				-				node->get_nb_core = NULL;
			
 
				-			}
			
 
				-			break;
			
 
				-
			
 
				-		case STARPU_SCC_SINK:
			
 
				-			{
			
 
				-				node->init = _starpu_scc_sink_init;
			
 
				-				node->deinit = _starpu_scc_sink_deinit;
			
 
				-				node->report_error = _starpu_scc_common_report_rcce_error;
			
 
				-
			
 
				-				node->mp_send = _starpu_scc_common_send;
			
 
				-				node->mp_recv = _starpu_scc_common_recv;
			
 
				-				node->dt_send = _starpu_scc_common_send;
			
 
				-				node->dt_recv = _starpu_scc_common_recv;
			
 
				-				node->dt_send_to_device = _starpu_scc_sink_send_to_device;
			
 
				-				node->dt_recv_from_device = _starpu_scc_sink_recv_from_device;
			
 
				-
			
 
				-				node->execute = _starpu_scc_sink_execute;
			
 
				-				node->allocate = _starpu_sink_common_allocate;
			
 
				-				node->free = _starpu_sink_common_free;
			
 
				-
			
 
				-				node->get_nb_core = NULL;
			
 
				-			}
			
 
				-			break;
			
 
				+	case STARPU_SCC_SOURCE:
			
 
				+	{
			
 
				+		node->init = _starpu_scc_src_init;
			
 
				+		node->deinit = NULL;
			
 
				+		node->deinit = NULL;
			
 
				+		node->report_error = _starpu_scc_common_report_rcce_error;
			
 
				+				
			
 
				+		node->mp_recv_is_ready = _starpu_scc_common_recv_is_ready;
			
 
				+		node->mp_send = _starpu_scc_common_send;
			
 
				+		node->mp_recv = _starpu_scc_common_recv;
			
 
				+		node->dt_send = _starpu_scc_common_send;
			
 
				+		node->dt_recv = _starpu_scc_common_recv;
			
 
				+		node->dt_send_to_device = NULL;
			
 
				+		node->dt_recv_from_device = NULL;
			
 
				+
			
 
				+		node->get_kernel_from_job =_starpu_scc_src_get_kernel_from_job;
			
 
				+		node->lookup = NULL;
			
 
				+		node->bind_thread = NULL;
			
 
				+		node->execute = NULL;
			
 
				+		node->allocate = NULL;
			
 
				+		node->free = NULL;
			
 
				+	}
			
 
				+	break;
			
 
				+
			
 
				+	case STARPU_SCC_SINK:
			
 
				+	{
			
 
				+		node->init = _starpu_scc_sink_init;
			
 
				+		node->launch_workers = _starpu_scc_sink_launch_workers;
			
 
				+		node->deinit = _starpu_scc_sink_deinit;
			
 
				+		node->report_error = _starpu_scc_common_report_rcce_error;
			
 
				+
			
 
				+		node->mp_recv_is_ready = _starpu_scc_common_recv_is_ready;
			
 
				+		node->mp_send = _starpu_scc_common_send;
			
 
				+		node->mp_recv = _starpu_scc_common_recv;
			
 
				+		node->dt_send = _starpu_scc_common_send;
			
 
				+		node->dt_recv = _starpu_scc_common_recv;
			
 
				+		node->dt_send_to_device = _starpu_scc_sink_send_to_device;
			
 
				+		node->dt_recv_from_device = _starpu_scc_sink_recv_from_device;
			
 
				+
			
 
				+		node->get_kernel_from_job = NULL;
			
 
				+		node->lookup = _starpu_scc_sink_lookup;
			
 
				+		node->bind_thread = _starpu_scc_sink_bind_thread;
			
 
				+		node->execute = _starpu_scc_sink_execute;
			
 
				+		node->allocate = _starpu_sink_common_allocate;
			
 
				+		node->free = _starpu_sink_common_free;
			
 
				+	}
			
 
				+	break;
			
 
				 #endif /* STARPU_USE_SCC */
			
 
				 
			
 
				 #ifdef STARPU_USE_MPI
			
 
				-		case STARPU_MPI_SOURCE:
			
 
				-			STARPU_ABORT();
			
 
				-			break;
			
 
				+	case STARPU_MPI_SOURCE:
			
 
				+		STARPU_ABORT();
			
 
				+		break;
			
 
				 
			
 
				-		case STARPU_MPI_SINK:
			
 
				-			STARPU_ABORT();
			
 
				-			break;
			
 
				+	case STARPU_MPI_SINK:
			
 
				+		STARPU_ABORT();
			
 
				+		break;
			
 
				 #endif /* STARPU_USE_MPI */
			
 
				 
			
 
				-		default:
			
 
				-			STARPU_ASSERT(0);
			
 
				+	default:
			
 
				+		STARPU_ASSERT(0);
			
 
				 	}
			
 
				 
			
 
				 	/* Let's allocate the buffer, we want it to be big enough to contain
			
@@ -159,15 +170,60 @@ struct _starpu_mp_node * STARPU_ATTRIBUTE_MALLOC
 
				 	if (node->init)
			
 
				 		node->init(node);
			
 
				 
			
 
				+	node->message_queue = mp_message_list_new();
			
 
				+	STARPU_PTHREAD_MUTEX_INIT(&node->message_queue_mutex,NULL);
			
 
				+
			
 
				+	/* If the node is a sink then we must initialize some field */
			
 
				+	if(node->kind == STARPU_MIC_SINK || node->kind == STARPU_SCC_SINK)
			
 
				+	{
			
 
				+		int i;
			
 
				+		node->is_running = 1;
			
 
				+		node->run_table = malloc(sizeof(struct mp_task *)*node->nb_cores);
			
 
				+		node->sem_run_table = malloc(sizeof(sem_t)*node->nb_cores);
			
 
				+
			
 
				+		for(i=0; i<node->nb_cores; i++)
			
 
				+		{
			
 
				+			node->run_table[i] = NULL;
			
 
				+			sem_init(&node->sem_run_table[i],0,0);
			
 
				+		}
			
 
				+		node->barrier_list = mp_barrier_list_new();
			
 
				+		STARPU_PTHREAD_MUTEX_INIT(&node->barrier_mutex,NULL);
			
 
				+
			
 
				+		STARPU_PTHREAD_BARRIER_INIT(&node->init_completed_barrier, NULL, node->nb_cores+1);
			
 
				+
			
 
				+		node->launch_workers(node);
			
 
				+	}	
			
 
				+
			
 
				+
			
 
				 	return node;
			
 
				 }
			
 
				 
			
 
				 /* Deinitialize the sink structure and release the structure */
			
 
				-
			
 
				 void _starpu_mp_common_node_destroy(struct _starpu_mp_node *node)
			
 
				 {
			
 
				 	if (node->deinit)
			
 
				 		node->deinit(node);
			
 
				+		
			
 
				+	mp_message_list_delete(node->message_queue);
			
 
				+	STARPU_PTHREAD_MUTEX_DESTROY(&node->message_queue_mutex);
			
 
				+
			
 
				+	/* If the node is a sink then we must destroy some field */
			
 
				+	if(node->kind == STARPU_MIC_SINK || node->kind == STARPU_SCC_SINK)
			
 
				+	{
			
 
				+		int i;
			
 
				+		for(i=0; i<node->nb_cores; i++)
			
 
				+		{
			
 
				+			sem_destroy(&node->sem_run_table[i]);
			
 
				+		}
			
 
				+
			
 
				+		free(node->run_table);
			
 
				+		free(node->sem_run_table);
			
 
				+
			
 
				+		mp_barrier_list_delete(node->barrier_list);
			
 
				+
			
 
				+		STARPU_PTHREAD_MUTEX_DESTROY(&node->barrier_mutex);
			
 
				+		STARPU_PTHREAD_BARRIER_DESTROY(&node->init_completed_barrier);
			
 
				+	}
			
 
				 
			
 
				 	free(node->buffer);
			
 
				 
			
@@ -175,7 +231,6 @@ void _starpu_mp_common_node_destroy(struct _starpu_mp_node *node)
 
				 }
			
 
				 
			
 
				 /* Send COMMAND to RECIPIENT, along with ARG if ARG_SIZE is non-zero */
			
 
				-
			
 
				 void _starpu_mp_common_send_command(const struct _starpu_mp_node *node,
			
 
				 				    const enum _starpu_mp_command command,
			
 
				 				    void *arg, int arg_size)
			
@@ -202,7 +257,6 @@ void _starpu_mp_common_send_command(const struct _starpu_mp_node *node,
 
				  * However, the data pointed by arg shouldn't be relied on after a new call to
			
 
				  * STARPU_MP_COMMON_RECV_COMMAND as it might corrupt it.
			
 
				  */
			
 
				-
			
 
				 enum _starpu_mp_command _starpu_mp_common_recv_command(const struct _starpu_mp_node *node,
			
 
				 						       void **arg, int *arg_size)
			
 
				 {
			
--- a/src/drivers/mp_common/mp_common.h
+++ b/src/drivers/mp_common/mp_common.h
@@ -18,10 +18,14 @@
 
				 #define __MP_COMMON_H__
			
 
				 
			
 
				 #include <pthread.h>
			
 
				+#include <semaphore.h>
			
 
				 
			
 
				 #include <starpu.h>
			
 
				 #include <common/config.h>
			
 
				-
			
 
				+#include <common/list.h>
			
 
				+#include <common/barrier.h>
			
 
				+#include <common/thread.h>
			
 
				+#include <datawizard/interfaces/data_interface.h>
			
 
				 
			
 
				 #ifdef STARPU_USE_MP
			
 
				 
			
@@ -34,31 +38,31 @@
 
				 #define STARPU_MP_SRC_NODE 0
			
 
				 #define STARPU_MP_SINK_NODE(a) ((a) + 1)
			
 
				 
			
 
				-#define STARPU_MP_COMMON_REPORT_ERROR(node, status) \
			
 
				+#define STARPU_MP_COMMON_REPORT_ERROR(node, status)			\
			
 
				 	(node)->report_error(__starpu_func__, __FILE__, __LINE__, (status))
			
 
				-
			
 
				-
			
 
				 enum _starpu_mp_command
			
 
				 {
			
 
				-	STARPU_EXIT = 0x00,
			
 
				-	STARPU_EXECUTE = 0x01,
			
 
				-	STARPU_ERROR_EXECUTE = 0x02,
			
 
				-	STARPU_LOOKUP = 0X03,
			
 
				-	STARPU_ANSWER_LOOKUP = 0X04,
			
 
				-	STARPU_ERROR_LOOKUP = 0X05,
			
 
				-	STARPU_ALLOCATE = 0x06,
			
 
				-	STARPU_ANSWER_ALLOCATE = 0x07,
			
 
				-	STARPU_ERROR_ALLOCATE = 0x08,
			
 
				-	STARPU_FREE = 0x09,
			
 
				-	STARPU_RECV_FROM_HOST = 0x10,
			
 
				-	STARPU_SEND_TO_HOST = 0x11,
			
 
				-	STARPU_RECV_FROM_SINK = 0x12,
			
 
				-	STARPU_SEND_TO_SINK = 0x13,
			
 
				-	STARPU_TRANSFER_COMPLETE = 0x14,
			
 
				-	STARPU_SINK_NBCORES = 0x15,
			
 
				-	STARPU_ANSWER_SINK_NBCORES = 0x16,
			
 
				-	STARPU_EXECUTION_SUBMITTED = 0x17,
			
 
				-	STARPU_EXECUTION_COMPLETED = 0x18
			
 
				+	STARPU_EXIT,
			
 
				+	STARPU_EXECUTE,
			
 
				+	STARPU_ERROR_EXECUTE,
			
 
				+	STARPU_LOOKUP,
			
 
				+	STARPU_ANSWER_LOOKUP,
			
 
				+	STARPU_ERROR_LOOKUP,
			
 
				+	STARPU_ALLOCATE,
			
 
				+	STARPU_ANSWER_ALLOCATE,
			
 
				+	STARPU_ERROR_ALLOCATE,
			
 
				+	STARPU_FREE,
			
 
				+	STARPU_RECV_FROM_HOST,
			
 
				+	STARPU_SEND_TO_HOST,
			
 
				+	STARPU_RECV_FROM_SINK,
			
 
				+	STARPU_SEND_TO_SINK,
			
 
				+	STARPU_TRANSFER_COMPLETE,
			
 
				+	STARPU_SINK_NBCORES,
			
 
				+	STARPU_ANSWER_SINK_NBCORES,
			
 
				+	STARPU_EXECUTION_SUBMITTED,
			
 
				+	STARPU_EXECUTION_COMPLETED,
			
 
				+	STARPU_PRE_EXECUTION,
			
 
				+	STARPU_SYNC_WORKERS,
			
 
				 };
			
 
				 
			
 
				 enum _starpu_mp_node_kind
			
@@ -96,12 +100,47 @@ struct _starpu_mp_transfer_command_to_device
 
				 	void *addr;
			
 
				 };
			
 
				 
			
 
				+LIST_TYPE(mp_barrier,
			
 
				+		int id;
			
 
				+		_starpu_pthread_barrier_t before_work_barrier;
			
 
				+		_starpu_pthread_barrier_t after_work_barrier;
			
 
				+	 );
			
 
				+
			
 
				+LIST_TYPE(mp_message,
			
 
				+		enum _starpu_mp_command type;
			
 
				+		char buffer[BUFFER_SIZE];
			
 
				+		int size;
			
 
				+	 );
			
 
				+
			
 
				+struct mp_task 
			
 
				+{
			
 
				+	void (*kernel)(void **, void *);
			
 
				+	void * interfaces[STARPU_NMAXBUFS]; 
			
 
				+	unsigned nb_interfaces;
			
 
				+	void *cl_arg;
			
 
				+	unsigned coreid;
			
 
				+	enum starpu_codelet_type type;
			
 
				+	int is_parallel_task;
			
 
				+	int combined_workerid;
			
 
				+ 	struct mp_barrier* mp_barrier;
			
 
				+};
			
 
				+
			
 
				+
			
 
				 /* Message-passing working node, whether source
			
 
				  * or sink */
			
 
				 struct _starpu_mp_node
			
 
				 {
			
 
				 	enum _starpu_mp_node_kind kind;
			
 
				 
			
 
				+	int baseworkerid;
			
 
				+
			
 
				+	/*the number of core on the device
			
 
				+	 * Must be initialized during init function*/
			
 
				+	int nb_cores;
			
 
				+
			
 
				+	/*Is starpu running*/
			
 
				+	int is_running;
			
 
				+
			
 
				 	/* Buffer used for scif data transfers, allocated
			
 
				 	 * during node initialization.
			
 
				 	 * Size : BUFFER_SIZE */
			
@@ -117,7 +156,7 @@ struct _starpu_mp_node
 
				 	int devid;
			
 
				 
			
 
				 	/* Only MIC use this for now !!
			
 
				-	*  Is the number ok MIC on the system. */
			
 
				+	 *  Is the number ok MIC on the system. */
			
 
				 	unsigned int nb_mp_sinks;
			
 
				 
			
 
				 	/* Connection used for command passing between the host thread and the
			
@@ -138,12 +177,32 @@ struct _starpu_mp_node
 
				 	 *  - sink_sink_dt_connections[j] is not initialized for the sink number j. */
			
 
				 	union _starpu_mp_connection *sink_sink_dt_connections;
			
 
				 
			
 
				+	/* */
			
 
				+	_starpu_pthread_barrier_t init_completed_barrier; 
			
 
				+	
			
 
				+	/* table to store pointer of the thread workers*/
			
 
				+	void* thread_table;
			
 
				+
			
 
				+        /*list where threads add messages to send to the source node */
			
 
				+        struct mp_message_list* message_queue;
			
 
				+	starpu_pthread_mutex_t message_queue_mutex;
			
 
				+
			
 
				+	/*list of barrier for combined worker*/
			
 
				+	struct mp_barrier_list* barrier_list;
			
 
				+	starpu_pthread_mutex_t barrier_mutex;
			
 
				+
			
 
				+	/*table where worker comme pick task*/
			
 
				+	struct mp_task ** run_table;
			
 
				+	sem_t * sem_run_table;
			
 
				+
			
 
				 	/* Node general functions */
			
 
				 	void (*init)(struct _starpu_mp_node *node);
			
 
				+	void (*launch_workers)(struct _starpu_mp_node *node);
			
 
				 	void (*deinit)(struct _starpu_mp_node *node);
			
 
				 	void (*report_error)(const char *, const char *, const int, const int);
			
 
				 
			
 
				 	/* Message passing */
			
 
				+	int (*mp_recv_is_ready)(const struct _starpu_mp_node *);
			
 
				 	void (*mp_send)(const struct _starpu_mp_node *, void *, int);
			
 
				 	void (*mp_recv)(const struct _starpu_mp_node *, void *, int);
			
 
				 
			
@@ -153,12 +212,12 @@ struct _starpu_mp_node
 
				 	void (*dt_send_to_device)(const struct _starpu_mp_node *, int, void *, int);
			
 
				 	void (*dt_recv_from_device)(const struct _starpu_mp_node *, int, void *, int);
			
 
				 
			
 
				-	void (*execute)(const struct _starpu_mp_node *, void *, int);
			
 
				-	void (*nbcores)(const struct _starpu_mp_node *);
			
 
				+	void (*(*get_kernel_from_job)(const struct _starpu_mp_node *,struct _starpu_job *))(void);
			
 
				+	void (*(*lookup)(const struct _starpu_mp_node *, char* ))(void);
			
 
				+	void (*bind_thread)(const struct _starpu_mp_node *, int,int *,int);
			
 
				+	void (*execute)(struct _starpu_mp_node *, void *, int);
			
 
				 	void (*allocate)(const struct _starpu_mp_node *, void *, int);
			
 
				 	void (*free)(const struct _starpu_mp_node *, void *, int);
			
 
				-
			
 
				-	unsigned int (*get_nb_core)(void);
			
 
				 };
			
 
				 
			
 
				 struct _starpu_mp_node * _starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind, int peer_devid);
			
@@ -170,7 +229,7 @@ void _starpu_mp_common_send_command(const struct _starpu_mp_node *node,
 
				 				    void *arg, int arg_size);
			
 
				 
			
 
				 enum _starpu_mp_command _starpu_mp_common_recv_command(const struct _starpu_mp_node *node,
			
 
				-						    void **arg, int *arg_size);
			
 
				+						       void **arg, int *arg_size);
			
 
				 
			
 
				 
			
 
				 #endif /* STARPU_USE_MP */
			
--- a/src/drivers/mp_common/sink_common.c
+++ b/src/drivers/mp_common/sink_common.c
@@ -15,20 +15,21 @@
 
				  */
			
 
				 
			
 
				 
			
 
				-#include <dlfcn.h>
			
 
				-
			
 
				 #include <starpu.h>
			
 
				 #include <common/config.h>
			
 
				 #include <common/utils.h>
			
 
				 #include <drivers/mp_common/mp_common.h>
			
 
				 #include <datawizard/interfaces/data_interface.h>
			
 
				-
			
 
				+#include <common/barrier.h>
			
 
				+#include <core/workers.h>
			
 
				+#include <common/barrier_counter.h>
			
 
				 #ifdef STARPU_USE_MIC
			
 
				 #include <common/COISysInfo_common.h>
			
 
				 #endif
			
 
				 
			
 
				 #include "sink_common.h"
			
 
				 
			
 
				+
			
 
				 /* Return the sink kind of the running process, based on the value of the
			
 
				  * STARPU_SINK environment variable.
			
 
				  * If there is no valid value retrieved, return STARPU_INVALID_KIND
			
@@ -50,90 +51,25 @@ static enum _starpu_mp_node_kind _starpu_sink_common_get_kind(void)
 
				 		return STARPU_INVALID_KIND;
			
 
				 }
			
 
				 
			
 
				-void
			
 
				-_starpu_sink_nbcores (const struct _starpu_mp_node *node)
			
 
				-{
			
 
				-    // Process packet received from `_starpu_src_common_sink_cores'.
			
 
				-    int nbcores = 1;
			
 
				-
			
 
				-#ifdef STARPU_USE_MIC
			
 
				-    // XXX I currently only support MIC for now.
			
 
				-    if (STARPU_MIC_SINK == _starpu_sink_common_get_kind ())
			
 
				-	nbcores = COISysGetCoreCount();
			
 
				-#endif
			
 
				 
			
 
				-    _starpu_mp_common_send_command (node, STARPU_ANSWER_SINK_NBCORES,
			
 
				-				    &nbcores, sizeof (int));
			
 
				-}
			
 
				-
			
 
				-
			
 
				-/* Receive paquet from _starpu_src_common_execute_kernel in the form below :
			
 
				- * [Function pointer on sink, number of interfaces, interfaces
			
 
				- * (union _starpu_interface), cl_arg]
			
 
				- * Then call the function given, passing as argument an array containing the
			
 
				- * addresses of the received interfaces
			
 
				+/* Send to host the number of cores of the sink device
			
 
				  */
			
 
				-void _starpu_sink_common_execute(const struct _starpu_mp_node *node,
			
 
				-					void *arg, int arg_size)
			
 
				+static void _starpu_sink_common_get_nb_cores (struct _starpu_mp_node *node)
			
 
				 {
			
 
				-	unsigned id = 0;
			
 
				-
			
 
				-	void *arg_ptr = arg;
			
 
				-	void (*kernel)(void **, void *) = NULL;
			
 
				-	unsigned coreid = 0;
			
 
				-	unsigned nb_interfaces = 0;
			
 
				-	void *interfaces[STARPU_NMAXBUFS];
			
 
				-	void *cl_arg;
			
 
				-
			
 
				-	kernel = *(void(**)(void **, void *)) arg_ptr;
			
 
				-	arg_ptr += sizeof(kernel);
			
 
				-
			
 
				-	coreid = *(unsigned *) arg_ptr;
			
 
				-	arg_ptr += sizeof(coreid);
			
 
				-
			
 
				-	nb_interfaces = *(unsigned *) arg_ptr;
			
 
				-	arg_ptr += sizeof(nb_interfaces);
			
 
				-
			
 
				-	/* The function needs an array pointing to each interface it needs
			
 
				-	 * during execution. As in sink-side there is no mean to know which
			
 
				-	 * kind of interface to expect, the array is composed of unions of
			
 
				-	 * interfaces, thus we expect the same size anyway */
			
 
				-	for (id = 0; id < nb_interfaces; id++)
			
 
				-	{
			
 
				-		interfaces[id] = arg_ptr;
			
 
				-		arg_ptr += sizeof(union _starpu_interface);
			
 
				-	}
			
 
				-
			
 
				-	/* Was cl_arg sent ? */
			
 
				-	if (arg_size > arg_ptr - arg)
			
 
				-		cl_arg = arg_ptr;
			
 
				-	else
			
 
				-		cl_arg = NULL;
			
 
				-
			
 
				-	//_STARPU_DEBUG("telling host that we have submitted the task %p.\n", kernel);
			
 
				-	/* XXX: in the future, we will not have to directly execute the kernel
			
 
				-	 * but submit it to the correct local worker. */
			
 
				-	_starpu_mp_common_send_command(node, STARPU_EXECUTION_SUBMITTED,
			
 
				-				       NULL, 0);
			
 
				-
			
 
				-	//_STARPU_DEBUG("executing the task %p\n", kernel);
			
 
				-	/* XXX: we keep the synchronous execution model on the sink side for
			
 
				-	 * now. */
			
 
				-	kernel(interfaces, cl_arg);
			
 
				-
			
 
				-	//_STARPU_DEBUG("telling host that we have finished the task %p.\n", kernel);
			
 
				-	_starpu_mp_common_send_command(node, STARPU_EXECUTION_COMPLETED,
			
 
				-				       &coreid, sizeof(coreid));
			
 
				+	// Process packet received from `_starpu_src_common_sink_cores'.
			
 
				+     	_starpu_mp_common_send_command (node, STARPU_ANSWER_SINK_NBCORES,
			
 
				+					&node->nb_cores, sizeof (int));
			
 
				 }
			
 
				 
			
 
				 
			
 
				+/* Send to host the address of the function given in parameter
			
 
				+ */
			
 
				 static void _starpu_sink_common_lookup(const struct _starpu_mp_node *node,
			
 
				 				       char *func_name)
			
 
				 {
			
 
				 	void (*func)(void);
			
 
				-	void *dl_handle = dlopen(NULL, RTLD_NOW);
			
 
				-	func = dlsym(dl_handle, func_name);
			
 
				-
			
 
				+	func = node->lookup(node,func_name);
			
 
				+	
			
 
				 	//_STARPU_DEBUG("Looked up %s, got %p\n", func_name, func);
			
 
				 
			
 
				 	/* If we couldn't find the function, let's send an error to the host.
			
@@ -146,21 +82,24 @@ static void _starpu_sink_common_lookup(const struct _starpu_mp_node *node,
 
				 					       NULL, 0);
			
 
				 }
			
 
				 
			
 
				+
			
 
				+/* Allocate a memory space and send the address of this space to the host
			
 
				+ */
			
 
				 void _starpu_sink_common_allocate(const struct _starpu_mp_node *mp_node,
			
 
				 				  void *arg, int arg_size)
			
 
				 {
			
 
				-    STARPU_ASSERT(arg_size == sizeof(size_t));
			
 
				-
			
 
				-    void *addr = malloc(*(size_t *)(arg));
			
 
				-
			
 
				-    /* If the allocation fail, let's send an error to the host.
			
 
				-     */
			
 
				-    if (addr)
			
 
				-	_starpu_mp_common_send_command(mp_node, STARPU_ANSWER_ALLOCATE,
			
 
				-				       &addr, sizeof(addr));
			
 
				-    else
			
 
				-	_starpu_mp_common_send_command(mp_node, STARPU_ERROR_ALLOCATE,
			
 
				-				       NULL, 0);
			
 
				+	STARPU_ASSERT(arg_size == sizeof(size_t));
			
 
				+
			
 
				+	void *addr = malloc(*(size_t *)(arg));
			
 
				+
			
 
				+	/* If the allocation fail, let's send an error to the host.
			
 
				+	 */
			
 
				+	if (addr)
			
 
				+		_starpu_mp_common_send_command(mp_node, STARPU_ANSWER_ALLOCATE,
			
 
				+					       &addr, sizeof(addr));
			
 
				+	else
			
 
				+		_starpu_mp_common_send_command(mp_node, STARPU_ERROR_ALLOCATE,
			
 
				+					       NULL, 0);
			
 
				 }
			
 
				 
			
 
				 void _starpu_sink_common_free(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED,
			
@@ -174,56 +113,129 @@ void _starpu_sink_common_free(const struct _starpu_mp_node *mp_node STARPU_ATTRI
 
				 static void _starpu_sink_common_copy_from_host(const struct _starpu_mp_node *mp_node,
			
 
				 					       void *arg, int arg_size)
			
 
				 {
			
 
				-    STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command));
			
 
				+	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command));
			
 
				 
			
 
				-    struct _starpu_mp_transfer_command *cmd = (struct _starpu_mp_transfer_command *)arg;
			
 
				+	struct _starpu_mp_transfer_command *cmd = (struct _starpu_mp_transfer_command *)arg;
			
 
				 
			
 
				-    mp_node->dt_recv(mp_node, cmd->addr, cmd->size);
			
 
				+	mp_node->dt_recv(mp_node, cmd->addr, cmd->size);
			
 
				 }
			
 
				 
			
 
				 static void _starpu_sink_common_copy_to_host(const struct _starpu_mp_node *mp_node,
			
 
				 					     void *arg, int arg_size)
			
 
				 {
			
 
				-    STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command));
			
 
				+	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command));
			
 
				 
			
 
				-    struct _starpu_mp_transfer_command *cmd = (struct _starpu_mp_transfer_command *)arg;
			
 
				+	struct _starpu_mp_transfer_command *cmd = (struct _starpu_mp_transfer_command *)arg;
			
 
				 
			
 
				-    mp_node->dt_send(mp_node, cmd->addr, cmd->size);
			
 
				+	mp_node->dt_send(mp_node, cmd->addr, cmd->size);
			
 
				 }
			
 
				 
			
 
				 static void _starpu_sink_common_copy_from_sink(const struct _starpu_mp_node *mp_node,
			
 
				 					       void *arg, int arg_size)
			
 
				 {
			
 
				-    STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command_to_device));
			
 
				+	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command_to_device));
			
 
				 
			
 
				-    struct _starpu_mp_transfer_command_to_device *cmd = (struct _starpu_mp_transfer_command_to_device *)arg;
			
 
				+	struct _starpu_mp_transfer_command_to_device *cmd = (struct _starpu_mp_transfer_command_to_device *)arg;
			
 
				 
			
 
				-    mp_node->dt_recv_from_device(mp_node, cmd->devid, cmd->addr, cmd->size);
			
 
				+	mp_node->dt_recv_from_device(mp_node, cmd->devid, cmd->addr, cmd->size);
			
 
				 
			
 
				-    _starpu_mp_common_send_command(mp_node, STARPU_TRANSFER_COMPLETE, NULL, 0);
			
 
				+	_starpu_mp_common_send_command(mp_node, STARPU_TRANSFER_COMPLETE, NULL, 0);
			
 
				 }
			
 
				 
			
 
				 static void _starpu_sink_common_copy_to_sink(const struct _starpu_mp_node *mp_node,
			
 
				 					     void *arg, int arg_size)
			
 
				 {
			
 
				-    STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command_to_device));
			
 
				+	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command_to_device));
			
 
				+
			
 
				+	struct _starpu_mp_transfer_command_to_device *cmd = (struct _starpu_mp_transfer_command_to_device *)arg;
			
 
				+
			
 
				+	mp_node->dt_send_to_device(mp_node, cmd->devid, cmd->addr, cmd->size);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/* Receive workers and combined workers and store them into the struct config
			
 
				+ */
			
 
				+static void _starpu_sink_common_recv_workers(struct _starpu_mp_node * node, void *arg, int arg_size)
			
 
				+{
			
 
				+	/* Retrieve information from the message */
			
 
				+	STARPU_ASSERT(arg_size == (sizeof(int)*5));
			
 
				+	void * arg_ptr = arg;
			
 
				+	int i;
			
 
				+	
			
 
				+	int nworkers = *(int *)arg_ptr; 
			
 
				+	arg_ptr += sizeof(nworkers);
			
 
				+
			
 
				+	int worker_size = *(int *)arg_ptr;
			
 
				+	arg_ptr += sizeof(worker_size);
			
 
				+
			
 
				+	int combined_worker_size = *(int *)arg_ptr;
			
 
				+	arg_ptr += sizeof(combined_worker_size);
			
 
				+	
			
 
				+	int baseworkerid = *(int *)arg_ptr;
			
 
				+	arg_ptr += sizeof(baseworkerid);
			
 
				+
			
 
				+	struct _starpu_machine_config *config = _starpu_get_machine_config();
			
 
				+	config->topology.nworkers = *(int *)arg_ptr;
			
 
				+
			
 
				+
			
 
				+	/* Retrieve workers */
			
 
				+	struct _starpu_worker * workers = &config->workers[baseworkerid];
			
 
				+	node->dt_recv(node,workers,worker_size);
			
 
				+	
			
 
				+	/* Update workers to have coherent field */
			
 
				+	for(i=0; i<nworkers; i++)
			
 
				+	{
			
 
				+		workers[i].config = config;
			
 
				+		starpu_pthread_mutex_init(&workers[i].mutex,NULL);
			
 
				+		starpu_pthread_mutex_destroy(&workers[i].mutex);
			
 
				+
			
 
				+		starpu_pthread_cond_init(&workers[i].started_cond,NULL);
			
 
				+		starpu_pthread_cond_destroy(&workers[i].started_cond);
			
 
				+
			
 
				+		starpu_pthread_cond_init(&workers[i].ready_cond,NULL);
			
 
				+		starpu_pthread_cond_destroy(&workers[i].ready_cond);
			
 
				+
			
 
				+		starpu_pthread_mutex_init(&workers[i].sched_mutex,NULL);
			
 
				+		starpu_pthread_mutex_destroy(&workers[i].sched_mutex);
			
 
				+
			
 
				+		starpu_pthread_cond_init(&workers[i].sched_cond,NULL);
			
 
				+		starpu_pthread_cond_destroy(&workers[i].sched_cond);
			
 
				 
			
 
				-    struct _starpu_mp_transfer_command_to_device *cmd = (struct _starpu_mp_transfer_command_to_device *)arg;
			
 
				+		workers[i].current_task = NULL;
			
 
				+		workers[i].set = NULL;
			
 
				+		workers[i].terminated_jobs = NULL;
			
 
				+	
			
 
				+		//_starpu_barrier_counter_init(&workers[i].tasks_barrier, 1);
			
 
				+		//_starpu_barrier_counter_destroy(&workers[i].tasks_barrier);
			
 
				 
			
 
				-    mp_node->dt_send_to_device(mp_node, cmd->devid, cmd->addr, cmd->size);
			
 
				+		starpu_pthread_mutex_init(&workers[i].parallel_sect_mutex,NULL);
			
 
				+		starpu_pthread_mutex_destroy(&workers[i].parallel_sect_mutex);
			
 
				+
			
 
				+		starpu_pthread_cond_init(&workers[i].parallel_sect_cond,NULL);
			
 
				+		starpu_pthread_cond_destroy(&workers[i].parallel_sect_cond);
			
 
				+
			
 
				+	}
			
 
				+
			
 
				+	/* Retrieve combined workers */
			
 
				+	struct _starpu_combined_worker * combined_workers = config->combined_workers; 
			
 
				+	node->dt_recv(node, combined_workers, combined_worker_size);
			
 
				+
			
 
				+	node->baseworkerid = baseworkerid;
			
 
				+	STARPU_PTHREAD_BARRIER_WAIT(&node->init_completed_barrier);	
			
 
				 }
			
 
				 
			
 
				+
			
 
				+
			
 
				 /* Function looping on the sink, waiting for tasks to execute.
			
 
				  * If the caller is the host, don't do anything.
			
 
				  */
			
 
				-
			
 
				 void _starpu_sink_common_worker(void)
			
 
				 {
			
 
				 	struct _starpu_mp_node *node = NULL;
			
 
				 	enum _starpu_mp_command command = STARPU_EXIT;
			
 
				 	int arg_size = 0;
			
 
				 	void *arg = NULL;
			
 
				-
			
 
				+	int exit_starpu = 0;
			
 
				 	enum _starpu_mp_node_kind node_kind = _starpu_sink_common_get_kind();
			
 
				 
			
 
				 	if (node_kind == STARPU_INVALID_KIND)
			
@@ -234,46 +246,82 @@ void _starpu_sink_common_worker(void)
 
				 	/* Create and initialize the node */
			
 
				 	node = _starpu_mp_common_node_create(node_kind, -1);
			
 
				 
			
 
				-	while ((command = _starpu_mp_common_recv_command(node, &arg, &arg_size)) != STARPU_EXIT)
			
 
				+	starpu_pthread_key_t worker_key;
			
 
				+	STARPU_PTHREAD_KEY_CREATE(&worker_key, NULL);
			
 
				+
			
 
				+
			
 
				+	struct _starpu_machine_config *config;
			
 
				+	while (!exit_starpu)
			
 
				 	{
			
 
				-		switch(command)
			
 
				+		/* If we have received a message */
			
 
				+		if(node->mp_recv_is_ready(node))
			
 
				+		{
			
 
				+
			
 
				+			command = _starpu_mp_common_recv_command(node, &arg, &arg_size);
			
 
				+			switch(command)
			
 
				+			{
			
 
				+				case STARPU_EXIT:
			
 
				+					exit_starpu = 1;
			
 
				+					break;
			
 
				+				case STARPU_EXECUTE:
			
 
				+					config = _starpu_get_machine_config();
			
 
				+					node->execute(node, arg, arg_size);
			
 
				+					break;
			
 
				+				case STARPU_SINK_NBCORES:
			
 
				+					_starpu_sink_common_get_nb_cores(node);
			
 
				+					break;
			
 
				+				case STARPU_LOOKUP:
			
 
				+					_starpu_sink_common_lookup(node, (char *) arg);
			
 
				+					break;
			
 
				+
			
 
				+				case STARPU_ALLOCATE:
			
 
				+					node->allocate(node, arg, arg_size);
			
 
				+					break;
			
 
				+
			
 
				+				case STARPU_FREE:
			
 
				+					node->free(node, arg, arg_size);
			
 
				+					break;
			
 
				+
			
 
				+				case STARPU_RECV_FROM_HOST:
			
 
				+					_starpu_sink_common_copy_from_host(node, arg, arg_size);
			
 
				+					break;
			
 
				+
			
 
				+				case STARPU_SEND_TO_HOST:
			
 
				+					_starpu_sink_common_copy_to_host(node, arg, arg_size);
			
 
				+					break;
			
 
				+
			
 
				+				case STARPU_RECV_FROM_SINK:
			
 
				+					_starpu_sink_common_copy_from_sink(node, arg, arg_size);
			
 
				+					break;
			
 
				+
			
 
				+				case STARPU_SEND_TO_SINK:
			
 
				+					_starpu_sink_common_copy_to_sink(node, arg, arg_size);
			
 
				+					break;
			
 
				+
			
 
				+				case STARPU_SYNC_WORKERS:
			
 
				+					_starpu_sink_common_recv_workers(node, arg, arg_size);
			
 
				+					break;
			
 
				+				default:
			
 
				+					printf("Oops, command %x unrecognized\n", command);
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		STARPU_PTHREAD_MUTEX_LOCK(&node->message_queue_mutex);
			
 
				+		/* If the list is not empty */
			
 
				+		if(!mp_message_list_empty(node->message_queue))
			
 
				 		{
			
 
				-			case STARPU_EXECUTE:
			
 
				-				node->execute(node, arg, arg_size);
			
 
				-				break;
			
 
				-			case STARPU_SINK_NBCORES:
			
 
				-				node->nbcores (node);
			
 
				-				break;
			
 
				-			case STARPU_LOOKUP:
			
 
				-				_starpu_sink_common_lookup(node, (char *) arg);
			
 
				-				break;
			
 
				-
			
 
				-			case STARPU_ALLOCATE:
			
 
				-				node->allocate(node, arg, arg_size);
			
 
				-				break;
			
 
				-
			
 
				-			case STARPU_FREE:
			
 
				-				node->free(node, arg, arg_size);
			
 
				-				break;
			
 
				-
			
 
				-			case STARPU_RECV_FROM_HOST:
			
 
				-				_starpu_sink_common_copy_from_host(node, arg, arg_size);
			
 
				-				break;
			
 
				-
			
 
				-			case STARPU_SEND_TO_HOST:
			
 
				-				_starpu_sink_common_copy_to_host(node, arg, arg_size);
			
 
				-				break;
			
 
				-
			
 
				-			case STARPU_RECV_FROM_SINK:
			
 
				-				_starpu_sink_common_copy_from_sink(node, arg, arg_size);
			
 
				-				break;
			
 
				-
			
 
				-			case STARPU_SEND_TO_SINK:
			
 
				-				_starpu_sink_common_copy_to_sink(node, arg, arg_size);
			
 
				-				break;
			
 
				-
			
 
				-			default:
			
 
				-				printf("Oops, command %x unrecognized\n", command);
			
 
				+			/* We pop a message and send it to the host */
			
 
				+			struct mp_message * message = mp_message_list_pop_back(node->message_queue);
			
 
				+			STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
			
 
				+			//_STARPU_DEBUG("telling host that we have finished the task %p sur %d.\n", task->kernel, task->coreid);
			
 
				+			config = _starpu_get_machine_config();
			
 
				+			_starpu_mp_common_send_command(node, message->type, 
			
 
				+					&message->buffer, message->size);
			
 
				+			mp_message_delete(message);
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				+			STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
			
 
				 		}
			
 
				 	}
			
 
				 
			
@@ -282,3 +330,300 @@ void _starpu_sink_common_worker(void)
 
				 
			
 
				 	exit(0);
			
 
				 }
			
 
				+
			
 
				+
			
 
				+/* Search for the mp_barrier correspondind to the specified combined worker 
			
 
				+ * and create it if it doesn't exist
			
 
				+ */
			
 
				+static struct mp_barrier * _starpu_sink_common_get_barrier(struct _starpu_mp_node * node, int cb_workerid, int cb_workersize)
			
 
				+{
			
 
				+	struct mp_barrier * b = NULL;
			
 
				+	STARPU_PTHREAD_MUTEX_LOCK(&node->barrier_mutex);
			
 
				+	/* Search if the barrier already exist */
			
 
				+	for(b = mp_barrier_list_begin(node->barrier_list); 
			
 
				+			b != mp_barrier_list_end(node->barrier_list) && b->id != cb_workerid; 
			
 
				+			b = mp_barrier_list_next(b));
			
 
				+
			
 
				+	/* If we found the barrier */
			
 
				+	if(b != NULL)
			
 
				+	{
			
 
				+		STARPU_PTHREAD_MUTEX_UNLOCK(&node->barrier_mutex);
			
 
				+		return b;
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+
			
 
				+		/* Else we create, initialize and add it to the list*/
			
 
				+		b = mp_barrier_new();
			
 
				+		b->id = cb_workerid;
			
 
				+		STARPU_PTHREAD_BARRIER_INIT(&b->before_work_barrier,NULL,cb_workersize);
			
 
				+		STARPU_PTHREAD_BARRIER_INIT(&b->after_work_barrier,NULL,cb_workersize);
			
 
				+		mp_barrier_list_push_back(node->barrier_list,b);
			
 
				+		STARPU_PTHREAD_MUTEX_UNLOCK(&node->barrier_mutex);
			
 
				+		return b;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/* Erase for the mp_barrier correspondind to the specified combined worker
			
 
				+*/
			
 
				+static void _starpu_sink_common_erase_barrier(struct _starpu_mp_node * node, struct mp_barrier *barrier)
			
 
				+{
			
 
				+	STARPU_PTHREAD_MUTEX_LOCK(&node->barrier_mutex);
			
 
				+	mp_barrier_list_erase(node->barrier_list,barrier);
			
 
				+	STARPU_PTHREAD_MUTEX_UNLOCK(&node->barrier_mutex);
			
 
				+}
			
 
				+
			
 
				+/* Append the message given in parameter to the message list
			
 
				+ */
			
 
				+static void _starpu_sink_common_append_message(struct _starpu_mp_node *node, struct mp_message * message)
			
 
				+{
			
 
				+	struct _starpu_machine_config *config = _starpu_get_machine_config();
			
 
				+	STARPU_PTHREAD_MUTEX_LOCK(&node->message_queue_mutex);
			
 
				+	mp_message_list_push_front(node->message_queue,message);
			
 
				+	STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
			
 
				+
			
 
				+}
			
 
				+/* Append to the message list a "STARPU_PRE_EXECUTION" message
			
 
				+ */
			
 
				+static void _starpu_sink_common_pre_execution_message(struct _starpu_mp_node *node, struct mp_task *task)
			
 
				+{
			
 
				+	/* Init message to tell the sink that the execution has begun */
			
 
				+	struct mp_message * message = mp_message_new();
			
 
				+	message->type = STARPU_PRE_EXECUTION;
			
 
				+	*(int *) message->buffer = task->combined_workerid;
			
 
				+	message->size = sizeof(task->combined_workerid);
			
 
				+
			
 
				+
			
 
				+	/* Append the message to the queue */	
			
 
				+	_starpu_sink_common_append_message(node, message);
			
 
				+
			
 
				+}
			
 
				+
			
 
				+/* Append to the message list a "STARPU_EXECUTION_COMPLETED" message
			
 
				+ */
			
 
				+static void _starpu_sink_common_execution_completed_message(struct _starpu_mp_node *node, struct mp_task *task)
			
 
				+{
			
 
				+	/* Init message to tell the sink that the execution is completed */
			
 
				+	struct mp_message * message = mp_message_new();
			
 
				+	message->type = STARPU_EXECUTION_COMPLETED;
			
 
				+	message->size = sizeof(task->coreid);
			
 
				+	*(int*) message->buffer = task->coreid;
			
 
				+
			
 
				+	/* Append the message to the queue */
			
 
				+	_starpu_sink_common_append_message(node, message);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/* Bind the thread which is running on the specified core to the combined worker */
			
 
				+static void _starpu_sink_common_bind_to_combined_worker(struct _starpu_mp_node *node, int coreid, struct _starpu_combined_worker * combined_worker)
			
 
				+{
			
 
				+	int i;
			
 
				+	int * bind_set = malloc(sizeof(int)*combined_worker->worker_size);
			
 
				+	for(i=0;i<combined_worker->worker_size;i++)
			
 
				+		bind_set[i] = combined_worker->combined_workerid[i] - node->baseworkerid;
			
 
				+	node->bind_thread(node, coreid, bind_set, combined_worker->worker_size);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+
			
 
				+/* Get the current rank of the worker in the combined worker 
			
 
				+ */
			
 
				+static int _starpu_sink_common_get_current_rank(int workerid, struct _starpu_combined_worker * combined_worker)
			
 
				+{
			
 
				+	int i;
			
 
				+	for(i=0; i<combined_worker->worker_size; i++)
			
 
				+		if(workerid == combined_worker->combined_workerid[i])
			
 
				+			return i;
			
 
				+
			
 
				+	STARPU_ASSERT(0);
			
 
				+}
			
 
				+
			
 
				+/* Execute the task 
			
 
				+ */
			
 
				+static void _starpu_sink_common_execute_kernel(struct _starpu_mp_node *node, int coreid, struct _starpu_worker * worker)
			
 
				+{
			
 
				+	struct _starpu_combined_worker * combined_worker = NULL;
			
 
				+	struct mp_task* task = node->run_table[coreid];
			
 
				+
			
 
				+
			
 
				+	/* If it's a parallel task */
			
 
				+	if(task->is_parallel_task)
			
 
				+	{
			
 
				+		combined_worker = _starpu_get_combined_worker_struct(task->combined_workerid);
			
 
				+		
			
 
				+		worker->current_rank = _starpu_sink_common_get_current_rank(worker->workerid, combined_worker);
			
 
				+		worker->combined_workerid = task->combined_workerid;
			
 
				+		worker->worker_size = combined_worker->worker_size;
			
 
				+		
			
 
				+		/* Synchronize with others threads of the combined worker*/
			
 
				+		STARPU_PTHREAD_BARRIER_WAIT(&task->mp_barrier->before_work_barrier);
			
 
				+
			
 
				+
			
 
				+		/* The first thread of the combined worker */
			
 
				+		if(worker->current_rank == 0)
			
 
				+		{
			
 
				+			/* tell the sink that the execution has begun */
			
 
				+			_starpu_sink_common_pre_execution_message(node,task);
			
 
				+
			
 
				+			/* If the mode is FORKJOIN, 
			
 
				+			 * the first thread binds himself 
			
 
				+			 * on all core of the combined worker*/
			
 
				+			if(task->type == STARPU_FORKJOIN)
			
 
				+			{
			
 
				+				_starpu_sink_common_bind_to_combined_worker(node, coreid, combined_worker);
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+	else
			
 
				+	{
			
 
				+		worker->current_rank = 0;
			
 
				+		worker->combined_workerid = 0;
			
 
				+		worker->worker_size = 1;
			
 
				+	}
			
 
				+	if(task->type != STARPU_FORKJOIN || worker->current_rank == 0)
			
 
				+	{
			
 
				+		/* execute the task */
			
 
				+		task->kernel(task->interfaces,task->cl_arg);
			
 
				+	}
			
 
				+
			
 
				+	/* If it's a parallel task */
			
 
				+	if(task->is_parallel_task)
			
 
				+	{
			
 
				+		/* Synchronize with others threads of the combined worker*/
			
 
				+		STARPU_PTHREAD_BARRIER_WAIT(&task->mp_barrier->after_work_barrier);
			
 
				+
			
 
				+		/* The fisrt thread of the combined */
			
 
				+		if(worker->current_rank == 0)
			
 
				+		{
			
 
				+			/* Erase the barrier from the list */
			
 
				+			_starpu_sink_common_erase_barrier(node,task->mp_barrier);
			
 
				+
			
 
				+			/* If the mode is FORKJOIN, 
			
 
				+			 * the first thread rebinds himself on his own core */
			
 
				+			if(task->type == STARPU_FORKJOIN)
			
 
				+				node->bind_thread(node, coreid, &coreid, 1);
			
 
				+
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	node->run_table[coreid] = NULL;
			
 
				+
			
 
				+	/* tell the sink that the execution is completed */
			
 
				+	_starpu_sink_common_execution_completed_message(node,task);
			
 
				+
			
 
				+	/*free the task*/
			
 
				+	unsigned i;
			
 
				+	for (i = 0; i < task->nb_interfaces; i++)
			
 
				+		free(task->interfaces[i]);
			
 
				+	free(task);
			
 
				+
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/* The main function executed by the thread 
			
 
				+ * thread_arg is a structure containing the information needed by the thread
			
 
				+ */
			
 
				+void* _starpu_sink_thread(void * thread_arg)
			
 
				+{
			
 
				+	/* Retrieve the information from the structure */
			
 
				+	struct _starpu_mp_node *node = ((struct arg_sink_thread *)thread_arg)->node;
			
 
				+	int coreid =((struct arg_sink_thread *)thread_arg)->coreid;
			
 
				+	/* free the structure */
			
 
				+	free(thread_arg);
			
 
				+
			
 
				+	STARPU_PTHREAD_BARRIER_WAIT(&node->init_completed_barrier);	
			
 
				+
			
 
				+	struct _starpu_worker *worker = &_starpu_get_machine_config()->workers[node->baseworkerid + coreid];
			
 
				+
			
 
				+	_starpu_set_local_worker_key(worker);
			
 
				+	while(node->is_running)
			
 
				+	{
			
 
				+		/*Wait there is a task available */
			
 
				+		sem_wait(&node->sem_run_table[coreid]);
			
 
				+		if(node->run_table[coreid] != NULL)
			
 
				+			_starpu_sink_common_execute_kernel(node,coreid,worker);
			
 
				+
			
 
				+	}
			
 
				+	pthread_exit(NULL);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/* Add the task to the specific thread and wake him up
			
 
				+*/
			
 
				+static void _starpu_sink_common_execute_thread(struct _starpu_mp_node *node, struct mp_task *task)
			
 
				+{
			
 
				+	/* Add the task to the specific thread */
			
 
				+	node->run_table[task->coreid] = task;
			
 
				+	/* Unlock the mutex to wake up the thread which will execute the task */
			
 
				+	sem_post(&node->sem_run_table[task->coreid]);
			
 
				+}
			
 
				+
			
 
				+
			
 
				+
			
 
				+/* Receive paquet from _starpu_src_common_execute_kernel in the form below :
			
 
				+ * [Function pointer on sink, number of interfaces, interfaces
			
 
				+ * (union _starpu_interface), cl_arg]
			
 
				+ * Then call the function given, passing as argument an array containing the
			
 
				+ * addresses of the received interfaces
			
 
				+ */
			
 
				+
			
 
				+void _starpu_sink_common_execute(struct _starpu_mp_node *node,
			
 
				+		void *arg, int arg_size)
			
 
				+{
			
 
				+	unsigned i;
			
 
				+
			
 
				+	void *arg_ptr = arg;
			
 
				+	struct mp_task *task = malloc(sizeof(struct mp_task));
			
 
				+
			
 
				+	task->kernel = *(void(**)(void **, void *)) arg_ptr;
			
 
				+	arg_ptr += sizeof(task->kernel);
			
 
				+
			
 
				+	task->type = *(enum starpu_codelet_type *) arg_ptr;
			
 
				+	arg_ptr += sizeof(task->type);
			
 
				+
			
 
				+	task->is_parallel_task = *(int *) arg_ptr;
			
 
				+	arg_ptr += sizeof(task->is_parallel_task);
			
 
				+
			
 
				+	if(task->is_parallel_task)
			
 
				+	{
			
 
				+		task->combined_workerid= *(int *) arg_ptr;
			
 
				+		arg_ptr += sizeof(task->combined_workerid);
			
 
				+
			
 
				+		task->mp_barrier = _starpu_sink_common_get_barrier(node,task->combined_workerid,_starpu_get_combined_worker_struct(task->combined_workerid)->worker_size);
			
 
				+	}
			
 
				+
			
 
				+	task->coreid = *(unsigned *) arg_ptr;
			
 
				+	arg_ptr += sizeof(task->coreid);
			
 
				+
			
 
				+	task->nb_interfaces = *(unsigned *) arg_ptr;
			
 
				+	arg_ptr += sizeof(task->nb_interfaces);
			
 
				+
			
 
				+	/* The function needs an array pointing to each interface it needs
			
 
				+	 * during execution. As in sink-side there is no mean to know which
			
 
				+	 * kind of interface to expect, the array is composed of unions of
			
 
				+	 * interfaces, thus we expect the same size anyway */
			
 
				+	for (i = 0; i < task->nb_interfaces; i++)
			
 
				+	{
			
 
				+		union _starpu_interface * interface = malloc(sizeof(union _starpu_interface));   
			
 
				+		memcpy(interface, arg_ptr, 
			
 
				+				sizeof(union _starpu_interface));
			
 
				+		task->interfaces[i] = interface;
			
 
				+		arg_ptr += sizeof(union _starpu_interface);
			
 
				+	}
			
 
				+
			
 
				+	/* Was cl_arg sent ? */
			
 
				+	if (arg_size > arg_ptr - arg)
			
 
				+		task->cl_arg = arg_ptr;
			
 
				+	else
			
 
				+		task->cl_arg = NULL;
			
 
				+
			
 
				+
			
 
				+	//_STARPU_DEBUG("telling host that we have submitted the task %p.\n", task->kernel);
			
 
				+	_starpu_mp_common_send_command(node, STARPU_EXECUTION_SUBMITTED,
			
 
				+			NULL, 0);
			
 
				+
			
 
				+	//_STARPU_DEBUG("executing the task %p\n", task->kernel);
			
 
				+	_starpu_sink_common_execute_thread(node, task);	
			
 
				+
			
 
				+}
			
--- a/src/drivers/mp_common/sink_common.h
+++ b/src/drivers/mp_common/sink_common.h
@@ -32,14 +32,21 @@ struct _starpu_sink_topology
 
				 	unsigned nb_cpus;
			
 
				 };
			
 
				 
			
 
				+struct arg_sink_thread
			
 
				+{
			
 
				+	struct _starpu_mp_node *node;
			
 
				+	int coreid;
			
 
				+};
			
 
				+
			
 
				 void _starpu_sink_common_worker(void);
			
 
				 
			
 
				-void _starpu_sink_common_execute(const struct _starpu_mp_node *node, void *arg, int arg_size);
			
 
				-void _starpu_sink_nbcores (const struct _starpu_mp_node *node);
			
 
				+void _starpu_sink_common_execute(struct _starpu_mp_node *node, void *arg, int arg_size);
			
 
				 
			
 
				 void _starpu_sink_common_allocate(const struct _starpu_mp_node *mp_node, void *arg, int arg_size);
			
 
				 void _starpu_sink_common_free(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, void *arg, int arg_size);
			
 
				 
			
 
				+void* _starpu_sink_thread(void * thread_arg);
			
 
				+
			
 
				 #endif /* STARPU_USE_MP */
			
 
				 
			
 
				 
			
--- a/src/drivers/mp_common/source_common.c
+++ b/src/drivers/mp_common/source_common.c
@@ -19,28 +19,241 @@
 
				 #include <pthread.h>
			
 
				 
			
 
				 #include <starpu.h>
			
 
				+#include <core/task.h>
			
 
				+#include <core/sched_policy.h>
			
 
				+
			
 
				+#include <drivers/driver_common/driver_common.h>
			
 
				+
			
 
				+
			
 
				 #include <datawizard/coherency.h>
			
 
				 #include <datawizard/interfaces/data_interface.h>
			
 
				 #include <drivers/mp_common/mp_common.h>
			
 
				 
			
 
				-int
			
 
				-_starpu_src_common_sink_nbcores (const struct _starpu_mp_node *node, int *buf)
			
 
				+
			
 
				+/* Finalize the execution of a task by a worker*/
			
 
				+static int _starpu_src_common_finalize_job (struct _starpu_job *j, struct _starpu_worker *worker) 
			
 
				+{
			
 
				+	uint32_t mask = 0;
			
 
				+	int profiling = starpu_profiling_status_get();
			
 
				+	struct timespec codelet_end;
			
 
				+	_starpu_driver_end_job(worker, j, worker->perf_arch, &codelet_end, 0,
			
 
				+			profiling);
			
 
				+	
			
 
				+	int count = worker->current_rank;
			
 
				+
			
 
				+	/* If it's a combined worker, we check if it's the last one of his combined */
			
 
				+	if(j->task_size > 1)
			
 
				+	{
			
 
				+		struct _starpu_combined_worker * cb_worker = _starpu_get_combined_worker_struct(worker->combined_workerid); 
			
 
				+
			
 
				+		pthread_mutex_lock(&cb_worker->count_mutex);
			
 
				+		count = cb_worker->count--;
			
 
				+		if(count == 0)
			
 
				+			cb_worker->count = cb_worker->worker_size - 1; 
			
 
				+		pthread_mutex_unlock(&cb_worker->count_mutex);
			
 
				+	}
			
 
				+
			
 
				+	/* Finalize the execution */
			
 
				+	if(count == 0)
			
 
				+	{
			
 
				+
			
 
				+		_starpu_driver_update_job_feedback(j, worker, worker->perf_arch,
			
 
				+				&j->cl_start, &codelet_end,
			
 
				+				profiling);
			
 
				+
			
 
				+		_starpu_push_task_output (j, mask);
			
 
				+
			
 
				+		_starpu_handle_job_termination(j);
			
 
				+	}
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/* Complete the execution of the job */
			
 
				+static int _starpu_src_common_process_completed_job(struct _starpu_worker_set *workerset, void * arg, int arg_size)
			
 
				+{
			
 
				+	int coreid;
			
 
				+
			
 
				+	STARPU_ASSERT(sizeof(coreid) == arg_size);	
			
 
				+	
			
 
				+	coreid = *(int *) arg;
			
 
				+
			
 
				+	struct _starpu_worker *worker = &workerset->workers[coreid];
			
 
				+	struct _starpu_job *j = _starpu_get_job_associated_to_task(worker->current_task);
			
 
				+
			
 
				+	struct _starpu_worker * old_worker = _starpu_get_local_worker_key();
			
 
				+
			
 
				+	_starpu_set_local_worker_key(worker);
			
 
				+	_starpu_src_common_finalize_job (j, worker);
			
 
				+	_starpu_set_local_worker_key(old_worker);
			
 
				+
			
 
				+	worker->current_task = NULL;
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+/* Tell the scheduler when the execution has begun */
			
 
				+static void _starpu_src_common_pre_exec(void * arg, int arg_size)
			
 
				+{
			
 
				+	int cb_workerid, i;
			
 
				+	STARPU_ASSERT(sizeof(cb_workerid) == arg_size);
			
 
				+	cb_workerid = *(int *) arg;
			
 
				+	struct _starpu_combined_worker *combined_worker = _starpu_get_combined_worker_struct(cb_workerid);
			
 
				+	for(i=0; i < combined_worker->worker_size; i++)
			
 
				+	{
			
 
				+		struct _starpu_worker * worker = _starpu_get_worker_struct(combined_worker->combined_workerid[i]);
			
 
				+		_starpu_set_local_worker_key(worker);
			
 
				+		_starpu_sched_pre_exec_hook(worker->current_task);
			
 
				+	}	
			
 
				+}
			
 
				+
			
 
				+/* recv a message and handle asynchronous message
			
 
				+ * return 0 if the message has not been handle (it's certainly mean that it's a synchronous message)
			
 
				+ * return 1 if the message has been handle
			
 
				+ */
			
 
				+static int _starpu_src_common_handle_async(const struct _starpu_mp_node *node STARPU_ATTRIBUTE_UNUSED, 
			
 
				+		void * arg, int arg_size, 
			
 
				+		enum _starpu_mp_command answer)
			
 
				+{
			
 
				+	struct _starpu_worker_set * worker_set=NULL; 
			
 
				+	switch(answer) 
			
 
				+	{
			
 
				+		case STARPU_EXECUTION_COMPLETED:
			
 
				+			worker_set = _starpu_get_worker_struct(starpu_worker_get_id())->set;
			
 
				+			_starpu_src_common_process_completed_job(worker_set, arg, arg_size);
			
 
				+			break;
			
 
				+		case STARPU_PRE_EXECUTION:
			
 
				+			_starpu_src_common_pre_exec(arg,arg_size);
			
 
				+			break;
			
 
				+		default:
			
 
				+			return 0;
			
 
				+			break;
			
 
				+	}
			
 
				+	return 1;
			
 
				+}
			
 
				+
			
 
				+/* Handle all message which have been stored in the message_queue */
			
 
				+static void _starpu_src_common_handle_stored_async(struct _starpu_mp_node *node)
			
 
				+{
			
 
				+	STARPU_PTHREAD_MUTEX_LOCK(&node->message_queue_mutex);
			
 
				+	/* while the list is not empty */
			
 
				+	while(!mp_message_list_empty(node->message_queue))
			
 
				+	{
			
 
				+		/* We pop a message and handle it */
			
 
				+		struct mp_message * message = mp_message_list_pop_back(node->message_queue);
			
 
				+		_starpu_src_common_handle_async(node, message->buffer, 
			
 
				+				message->size, message->type);
			
 
				+		mp_message_delete(message);
			
 
				+	}
			
 
				+	STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
			
 
				+}
			
 
				+
			
 
				+/* Store a message if is asynchronous 
			
 
				+ * return 1 if the message has been stored
			
 
				+ * return 0 if the message is unknown or synchrone */
			
 
				+int _starpu_src_common_store_message(struct _starpu_mp_node *node, 
			
 
				+		void * arg, int arg_size, enum _starpu_mp_command answer)
			
 
				+{
			
 
				+	struct mp_message * message = NULL;
			
 
				+	switch(answer)
			
 
				+	{
			
 
				+		case STARPU_EXECUTION_COMPLETED:
			
 
				+		case STARPU_PRE_EXECUTION:
			
 
				+			message = mp_message_new();
			
 
				+			message->type = answer;
			
 
				+			memcpy(message->buffer, arg, arg_size); 
			
 
				+			message->size = arg_size; 
			
 
				+
			
 
				+			STARPU_PTHREAD_MUTEX_LOCK(&node->message_queue_mutex);
			
 
				+			mp_message_list_push_front(node->message_queue,message);
			
 
				+			STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
			
 
				+			return 1;
			
 
				+			break;
			
 
				+		default:
			
 
				+			return 0;
			
 
				+			break;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/* Store all asynchronous messages and return when a synchronous message is received */
			
 
				+static enum _starpu_mp_command _starpu_src_common_wait_command_sync(struct _starpu_mp_node *node, 
			
 
				+		void ** arg, int* arg_size)
			
 
				+{
			
 
				+	enum _starpu_mp_command answer;
			
 
				+	int is_sync = 0;
			
 
				+	while(!is_sync)
			
 
				+	{
			
 
				+		answer = _starpu_mp_common_recv_command(node, arg, arg_size);
			
 
				+		if(!_starpu_src_common_store_message(node,*arg,*arg_size,answer))
			
 
				+			is_sync=1;
			
 
				+	}
			
 
				+	return answer;
			
 
				+}
			
 
				+
			
 
				+/* Handle a asynchrone message and return a error if a synchronous message is received */
			
 
				+static void _starpu_src_common_recv_async(struct _starpu_mp_node * node)
			
 
				+{
			
 
				+	enum _starpu_mp_command answer;
			
 
				+	void *arg;
			
 
				+	int arg_size;
			
 
				+	answer = _starpu_mp_common_recv_command(node, &arg, &arg_size);
			
 
				+	if(!_starpu_src_common_handle_async(node,arg,arg_size,answer))
			
 
				+	{
			
 
				+		printf("incorrect commande: unknown command or sync command");
			
 
				+		STARPU_ASSERT(0);
			
 
				+	}	
			
 
				+}
			
 
				+
			
 
				+/* Handle all asynchrone message while a completed execution message from a specific worker has been receive */
			
 
				+ enum _starpu_mp_command _starpu_src_common_wait_completed_execution(struct _starpu_mp_node *node, int devid, void **arg, int * arg_size)
			
 
				+{
			
 
				+	enum _starpu_mp_command answer;
			
 
				+
			
 
				+	int completed = 0;	
			
 
				+	/*While the waited completed execution message has not been receive*/
			
 
				+	while(!completed)
			
 
				+	{
			
 
				+		answer = _starpu_mp_common_recv_command (node, arg, arg_size);
			
 
				+
			
 
				+		if(answer == STARPU_EXECUTION_COMPLETED)
			
 
				+		{
			
 
				+			int coreid;
			
 
				+			STARPU_ASSERT(sizeof(coreid) == *arg_size);	
			
 
				+			coreid = *(int *) *arg;
			
 
				+			if(devid == coreid)
			
 
				+				completed = 1;
			
 
				+			else
			
 
				+				if(!_starpu_src_common_store_message(node, *arg, *arg_size, answer))
			
 
				+					/* We receive a unknown or asynchronous message  */
			
 
				+					STARPU_ASSERT(0);
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				+			if(!_starpu_src_common_store_message(node, *arg, *arg_size, answer))
			
 
				+				/* We receive a unknown or asynchronous message  */
			
 
				+				STARPU_ASSERT(0);
			
 
				+		}
			
 
				+	}
			
 
				+	return answer;
			
 
				+}
			
 
				+
			
 
				+
			
 
				+/* Send a request to the sink NODE for the number of cores on it. */
			
 
				+int _starpu_src_common_sink_nbcores (const struct _starpu_mp_node *node, int *buf)
			
 
				 {
			
 
				-    // Send a request to the sink NODE for the number of cores on it.
			
 
				 
			
 
				-    enum _starpu_mp_command answer;
			
 
				-    void *arg;
			
 
				-    int arg_size = sizeof (int);
			
 
				+	enum _starpu_mp_command answer;
			
 
				+	void *arg;
			
 
				+	int arg_size = sizeof (int);
			
 
				 
			
 
				-    _starpu_mp_common_send_command (node, STARPU_SINK_NBCORES, NULL, 0);
			
 
				+	_starpu_mp_common_send_command (node, STARPU_SINK_NBCORES, NULL, 0);
			
 
				 
			
 
				-    answer = _starpu_mp_common_recv_command (node, &arg, &arg_size);
			
 
				+	answer = _starpu_mp_common_recv_command (node, &arg, &arg_size);
			
 
				 
			
 
				-    STARPU_ASSERT (answer == STARPU_ANSWER_SINK_NBCORES && arg_size == sizeof (int));
			
 
				+	STARPU_ASSERT (answer == STARPU_ANSWER_SINK_NBCORES && arg_size == sizeof (int));
			
 
				 
			
 
				-    memcpy (buf, arg, arg_size);
			
 
				+	memcpy (buf, arg, arg_size);
			
 
				 
			
 
				-    return 0;
			
 
				+	return 0;
			
 
				 }
			
 
				 
			
 
				 /* Send a request to the sink linked to NODE for the pointer to the
			
@@ -49,7 +262,7 @@ _starpu_src_common_sink_nbcores (const struct _starpu_mp_node *node, int *buf)
 
				  * else it returns -ESPIPE if the function was not found.
			
 
				  */
			
 
				 int _starpu_src_common_lookup(struct _starpu_mp_node *node,
			
 
				-			      void (**func_ptr)(void), const char *func_name)
			
 
				+		void (**func_ptr)(void), const char *func_name)
			
 
				 {
			
 
				 	enum _starpu_mp_command answer;
			
 
				 	void *arg;
			
@@ -60,19 +273,21 @@ int _starpu_src_common_lookup(struct _starpu_mp_node *node,
 
				 
			
 
				 	//_STARPU_DEBUG("Looking up %s\n", func_name);
			
 
				 	_starpu_mp_common_send_command(node, STARPU_LOOKUP, (void *) func_name,
			
 
				-				       arg_size);
			
 
				-	answer = _starpu_mp_common_recv_command(node, (void **) &arg,
			
 
				-						&arg_size);
			
 
				+			arg_size);
			
 
				+
			
 
				+	answer = _starpu_src_common_wait_command_sync(node, (void **) &arg,
			
 
				+			&arg_size);
			
 
				 
			
 
				-	if (answer == STARPU_ERROR_LOOKUP) {
			
 
				+	if (answer == STARPU_ERROR_LOOKUP) 
			
 
				+	{
			
 
				 		_STARPU_DISP("Error looking up symbol %s\n", func_name);
			
 
				 		return -ESPIPE;
			
 
				 	}
			
 
				 
			
 
				 	/* We have to be sure the device answered the right question and the
			
 
				 	 * answer has the right size */
			
 
				-	STARPU_ASSERT(answer == STARPU_ANSWER_LOOKUP &&
			
 
				-		      arg_size == sizeof(*func_ptr));
			
 
				+	STARPU_ASSERT(answer == STARPU_ANSWER_LOOKUP);
			
 
				+	STARPU_ASSERT(arg_size == sizeof(*func_ptr));
			
 
				 
			
 
				 	memcpy(func_ptr, arg, arg_size);
			
 
				 
			
@@ -81,32 +296,46 @@ int _starpu_src_common_lookup(struct _starpu_mp_node *node,
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				- /* Send a message to the sink to execute a kernel.
			
 
				+/* Send a message to the sink to execute a kernel.
			
 
				  * The message sent has the form below :
			
 
				  * [Function pointer on sink, number of interfaces, interfaces
			
 
				  * (union _starpu_interface), cl_arg]
			
 
				  */
			
 
				-int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
			
 
				-				      void (*kernel)(void), unsigned coreid,
			
 
				-				      starpu_data_handle_t *handles,
			
 
				-				      void **interfaces,
			
 
				-				      unsigned nb_interfaces,
			
 
				-				      void *cl_arg, size_t cl_arg_size)
			
 
				+/* Launch the execution of the function KERNEL points to on the sink linked
			
 
				+ * to NODE. Returns 0 in case of success, -EINVAL if kernel is an invalid
			
 
				+ * pointer.
			
 
				+ * Data interfaces in task are send to the sink.
			
 
				+ */
			
 
				+int _starpu_src_common_execute_kernel(struct _starpu_mp_node *node,
			
 
				+		void (*kernel)(void), unsigned coreid,
			
 
				+		enum starpu_codelet_type type,
			
 
				+		int is_parallel_task, int cb_workerid,
			
 
				+		starpu_data_handle_t *handles,
			
 
				+		void **interfaces,
			
 
				+		unsigned nb_interfaces,
			
 
				+		void *cl_arg, size_t cl_arg_size)
			
 
				 {
			
 
				-	unsigned id;
			
 
				-	void *buffer, *buffer_ptr, *arg = NULL;
			
 
				-	int buffer_size = 0, arg_size = 0;
			
 
				+
			
 
				+	void *buffer, *buffer_ptr, *arg =NULL;
			
 
				+	int buffer_size = 0, arg_size =0;
			
 
				+	unsigned i;
			
 
				+
			
 
				+	buffer_size = sizeof(kernel) + sizeof(coreid) + sizeof(type)
			
 
				+		+ sizeof(nb_interfaces) + nb_interfaces * sizeof(union _starpu_interface) + sizeof(is_parallel_task);
			
 
				+	
			
 
				+	/*if the task is paralle*/
			
 
				+	if(is_parallel_task)
			
 
				+	{
			
 
				+		buffer_size += sizeof(cb_workerid); 
			
 
				+	}
			
 
				 
			
 
				 	/* If the user didn't give any cl_arg, there is no need to send it */
			
 
				-	buffer_size =
			
 
				-	    sizeof(kernel) + sizeof(coreid) + sizeof(nb_interfaces) +
			
 
				-	    nb_interfaces * sizeof(union _starpu_interface);
			
 
				 	if (cl_arg)
			
 
				 	{
			
 
				 		STARPU_ASSERT(cl_arg_size);
			
 
				 		buffer_size += cl_arg_size;
			
 
				 	}
			
 
				-
			
 
				+	
			
 
				 	/* We give to send_command a buffer we just allocated, which contains
			
 
				 	 * a pointer to the function (sink-side), core on which execute this
			
 
				 	 * function (sink-side), number of interfaces we send,
			
@@ -116,6 +345,18 @@ int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
 
				 	*(void(**)(void)) buffer = kernel;
			
 
				 	buffer_ptr += sizeof(kernel);
			
 
				 
			
 
				+	*(enum starpu_codelet_type *) buffer_ptr = type;
			
 
				+	buffer_ptr += sizeof(type);
			
 
				+
			
 
				+	*(int *) buffer_ptr = is_parallel_task;
			
 
				+	buffer_ptr += sizeof(is_parallel_task);
			
 
				+
			
 
				+	if(is_parallel_task)
			
 
				+	{
			
 
				+		*(int *) buffer_ptr = cb_workerid ;
			
 
				+		buffer_ptr += sizeof(cb_workerid);
			
 
				+	}
			
 
				+
			
 
				 	*(unsigned *) buffer_ptr = coreid;
			
 
				 	buffer_ptr += sizeof(coreid);
			
 
				 
			
@@ -126,11 +367,13 @@ int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
 
				 	 * executed on a sink with a different memory, whereas a codelet is
			
 
				 	 * executed on the host part for the other accelerators.
			
 
				 	 * Thus we need to send a copy of each interface on the MP device */
			
 
				-	for (id = 0; id < nb_interfaces; id++)
			
 
				+
			
 
				+	for (i = 0; i < nb_interfaces; i++)
			
 
				 	{
			
 
				-		starpu_data_handle_t handle = handles[id];
			
 
				-		memcpy (buffer_ptr, interfaces[id],
			
 
				-			handle->ops->interface_size);
			
 
				+		starpu_data_handle_t handle = handles[i];
			
 
				+
			
 
				+		memcpy (buffer_ptr, interfaces[i],
			
 
				+				handle->ops->interface_size);
			
 
				 		/* The sink side has no mean to get the type of each
			
 
				 		 * interface, we use a union to make it generic and permit the
			
 
				 		 * sink to go through the array */
			
@@ -141,7 +384,7 @@ int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
 
				 		memcpy(buffer_ptr, cl_arg, cl_arg_size);
			
 
				 
			
 
				 	_starpu_mp_common_send_command(node, STARPU_EXECUTE, buffer, buffer_size);
			
 
				-	enum _starpu_mp_command answer = _starpu_mp_common_recv_command(node, &arg, &arg_size);
			
 
				+	enum _starpu_mp_command answer = _starpu_src_common_wait_command_sync(node, &arg, &arg_size);
			
 
				 
			
 
				 	if (answer == STARPU_ERROR_EXECUTE)
			
 
				 		return -EINVAL;
			
@@ -151,46 +394,76 @@ int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
 
				 	free(buffer);
			
 
				 
			
 
				 	return 0;
			
 
				-
			
 
				 }
			
 
				 
			
 
				-/* Launch the execution of the function KERNEL points to on the sink linked
			
 
				- * to NODE. Returns 0 in case of success, -EINVAL if kernel is an invalid
			
 
				- * pointer.
			
 
				- * Data interfaces in task are send to the sink.
			
 
				- */
			
 
				-int _starpu_src_common_execute_kernel_from_task(const struct _starpu_mp_node *node,
			
 
				-						void (*kernel)(void), unsigned coreid,
			
 
				-						struct starpu_task *task)
			
 
				+
			
 
				+/* Get the information and call the function to send to the sink a message to execute the task*/
			
 
				+static int _starpu_src_common_execute(struct _starpu_job *j, 
			
 
				+		struct _starpu_worker *worker, 
			
 
				+		struct _starpu_mp_node * node)
			
 
				 {
			
 
				-    return _starpu_src_common_execute_kernel(node, kernel, coreid,
			
 
				-					     task->handles, task->interfaces, task->cl->nbuffers,
			
 
				-					     task->cl_arg, task->cl_arg_size);
			
 
				+	int ret;
			
 
				+	uint32_t mask = 0;
			
 
				+
			
 
				+	STARPU_ASSERT(j);
			
 
				+	struct starpu_task *task = j->task;
			
 
				+
			
 
				+	int profiling = starpu_profiling_status_get();
			
 
				+
			
 
				+	STARPU_ASSERT(task);
			
 
				+	if (worker->current_rank == 0) 
			
 
				+	{
			
 
				+		ret = _starpu_fetch_task_input(j, mask);
			
 
				+		if (ret != 0)
			
 
				+		{
			
 
				+			/* there was not enough memory, so the input of
			
 
				+			 * the codelet cannot be fetched ... put the
			
 
				+			 * codelet back, and try it later */
			
 
				+			return -EAGAIN;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	void (*kernel)(void)  = node->get_kernel_from_job(node,j);
			
 
				+
			
 
				+	_starpu_driver_start_job(worker, j, &j->cl_start, 0, profiling);
			
 
				+
			
 
				+
			
 
				+	//_STARPU_DEBUG("\nworkerid:%d, rank:%d, type:%d,	cb_workerid:%d, task_size:%d\n\n",worker->devid,worker->current_rank,task->cl->type,j->combined_workerid,j->task_size);
			
 
				+
			
 
				+	_starpu_src_common_execute_kernel(node, kernel, worker->devid, task->cl->type,
			
 
				+			(j->task_size > 1),
			
 
				+			j->combined_workerid, task->handles,
			
 
				+			task->interfaces, task->cl->nbuffers,
			
 
				+			task->cl_arg, task->cl_arg_size);
			
 
				+
			
 
				+
			
 
				+	return 0;
			
 
				 }
			
 
				 
			
 
				+
			
 
				 /* Send a request to the sink linked to the MP_NODE to allocate SIZE bytes on
			
 
				  * the sink.
			
 
				  * In case of success, it returns 0 and *ADDR contains the address of the
			
 
				  * allocated area ;
			
 
				  * else it returns 1 if the allocation fail.
			
 
				  */
			
 
				-int _starpu_src_common_allocate(const struct _starpu_mp_node *mp_node,
			
 
				-								void **addr, size_t size)
			
 
				+int _starpu_src_common_allocate(struct _starpu_mp_node *mp_node,
			
 
				+		void **addr, size_t size)
			
 
				 {
			
 
				 	enum _starpu_mp_command answer;
			
 
				 	void *arg;
			
 
				 	int arg_size;
			
 
				 
			
 
				 	_starpu_mp_common_send_command(mp_node, STARPU_ALLOCATE, &size,
			
 
				-								   sizeof(size));
			
 
				+			sizeof(size));
			
 
				 
			
 
				-	answer = _starpu_mp_common_recv_command(mp_node, &arg, &arg_size);
			
 
				+	answer = _starpu_src_common_wait_command_sync(mp_node, &arg, &arg_size);
			
 
				 
			
 
				 	if (answer == STARPU_ERROR_ALLOCATE)
			
 
				 		return 1;
			
 
				 
			
 
				 	STARPU_ASSERT(answer == STARPU_ANSWER_ALLOCATE &&
			
 
				-				  arg_size == sizeof(*addr));
			
 
				+			arg_size == sizeof(*addr));
			
 
				 
			
 
				 	memcpy(addr, arg, arg_size);
			
 
				 
			
@@ -201,15 +474,15 @@ int _starpu_src_common_allocate(const struct _starpu_mp_node *mp_node,
 
				  * area pointed by ADDR.
			
 
				  */
			
 
				 void _starpu_src_common_free(const struct _starpu_mp_node *mp_node,
			
 
				-							 void *addr)
			
 
				+		void *addr)
			
 
				 {
			
 
				 	_starpu_mp_common_send_command(mp_node, STARPU_FREE, &addr, sizeof(addr));
			
 
				 }
			
 
				 
			
 
				 /* Send SIZE bytes pointed by SRC to DST on the sink linked to the MP_NODE.
			
 
				- */
			
 
				+*/
			
 
				 int _starpu_src_common_copy_host_to_sink(const struct _starpu_mp_node *mp_node,
			
 
				-										 void *src, void *dst, size_t size)
			
 
				+		void *src, void *dst, size_t size)
			
 
				 {
			
 
				 	struct _starpu_mp_transfer_command cmd = {size, dst};
			
 
				 
			
@@ -220,9 +493,9 @@ int _starpu_src_common_copy_host_to_sink(const struct _starpu_mp_node *mp_node,
 
				 }
			
 
				 
			
 
				 /* Receive SIZE bytes pointed by SRC on the sink linked to the MP_NODE and store them in DST.
			
 
				- */
			
 
				+*/
			
 
				 int _starpu_src_common_copy_sink_to_host(const struct _starpu_mp_node *mp_node,
			
 
				-										 void *src, void *dst, size_t size)
			
 
				+		void *src, void *dst, size_t size)
			
 
				 {
			
 
				 	struct _starpu_mp_transfer_command cmd = {size, src};
			
 
				 
			
@@ -265,8 +538,8 @@ int _starpu_src_common_copy_sink_to_sink(const struct _starpu_mp_node *src_node,
 
				 /* 5 functions to determine the executable to run on the device (MIC, SCC,
			
 
				  * MPI).
			
 
				  */
			
 
				-static void _starpu_src_common_cat_3(char *final, const char *first, const char *second,
			
 
				-										  const char *third)
			
 
				+static void _starpu_src_common_cat_3(char *final, const char *first, 
			
 
				+		const char *second, const char *third)
			
 
				 {
			
 
				 	strcpy(final, first);
			
 
				 	strcat(final, second);
			
@@ -304,9 +577,9 @@ static int _starpu_src_common_test_suffixes(char *located_file_name, const char
 
				 }
			
 
				 
			
 
				 int _starpu_src_common_locate_file(char *located_file_name,
			
 
				-							const char *env_file_name, const char *env_mic_path,
			
 
				-							const char *config_file_name, const char *actual_file_name,
			
 
				-							const char **suffixes)
			
 
				+		const char *env_file_name, const char *env_mic_path,
			
 
				+		const char *config_file_name, const char *actual_file_name,
			
 
				+		const char **suffixes)
			
 
				 {
			
 
				 	if (env_file_name != NULL)
			
 
				 	{
			
@@ -372,3 +645,98 @@ int _starpu_src_common_locate_file(char *located_file_name,
 
				 
			
 
				 	return 1;
			
 
				 }
			
 
				+
			
 
				+/* Send workers to the sink node 
			
 
				+ */
			
 
				+static void _starpu_src_common_send_workers(struct _starpu_mp_node * node, int baseworkerid, int nworkers)
			
 
				+{	
			
 
				+	struct _starpu_machine_config *config = _starpu_get_machine_config();
			
 
				+	int worker_size = sizeof(struct _starpu_worker)*nworkers;	
			
 
				+	int combined_worker_size = STARPU_NMAX_COMBINEDWORKERS*sizeof(struct _starpu_combined_worker);
			
 
				+	int msg[5];
			
 
				+	msg[0] = nworkers;
			
 
				+	msg[1] = worker_size;
			
 
				+	msg[2] = combined_worker_size;
			
 
				+	msg[3] = baseworkerid;
			
 
				+	msg[4] = starpu_worker_get_count();
			
 
				+
			
 
				+	/* tell the sink node that we will send him all workers */
			
 
				+	_starpu_mp_common_send_command(node, STARPU_SYNC_WORKERS, 
			
 
				+			&msg, sizeof(msg));
			
 
				+
			
 
				+	/* Send all worker to the sink node */
			
 
				+	node->dt_send(node,&config->workers[baseworkerid],worker_size);
			
 
				+
			
 
				+	/* Send all combined workers to the sink node */
			
 
				+	node->dt_send(node, &config->combined_workers,combined_worker_size);
			
 
				+}	
			
 
				+
			
 
				+/* Function looping on the source node */
			
 
				+void _starpu_src_common_worker(struct _starpu_worker_set * worker_set, 
			
 
				+		unsigned baseworkerid, 
			
 
				+		struct _starpu_mp_node * mp_node)
			
 
				+{ 
			
 
				+	unsigned memnode = worker_set->workers[0].memory_node;
			
 
				+	struct starpu_task **tasks = malloc(sizeof(struct starpu_task *)*worker_set->nworkers);
			
 
				+
			
 
				+	_starpu_src_common_send_workers(mp_node, baseworkerid, worker_set->nworkers);
			
 
				+
			
 
				+	/*main loop*/
			
 
				+	while (_starpu_machine_is_running())
			
 
				+	{
			
 
				+		int res;
			
 
				+		struct _starpu_job * j;
			
 
				+
			
 
				+		_STARPU_TRACE_START_PROGRESS(memnode);
			
 
				+		_starpu_datawizard_progress(memnode, 1);
			
 
				+		_STARPU_TRACE_END_PROGRESS(memnode);
			
 
				+
			
 
				+		/* Handle message which have been store */
			
 
				+		_starpu_src_common_handle_stored_async(mp_node);
			
 
				+
			
 
				+		/* poll the device for completed jobs.*/
			
 
				+		while(mp_node->mp_recv_is_ready(mp_node))
			
 
				+			_starpu_src_common_recv_async(mp_node);
			
 
				+
			
 
				+		/* get task for each worker*/
			
 
				+		res = _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers);
			
 
				+
			
 
				+		/*if at least one worker have pop a task*/
			
 
				+		if(res != 0)
			
 
				+		{
			
 
				+			unsigned i;
			
 
				+			for(i=0; i<worker_set->nworkers; i++)
			
 
				+			{
			
 
				+				if(tasks[i] != NULL)
			
 
				+				{
			
 
				+					j = _starpu_get_job_associated_to_task(tasks[i]);
			
 
				+					_starpu_set_local_worker_key(&worker_set->workers[i]);
			
 
				+					res =  _starpu_src_common_execute(j, &worker_set->workers[i], mp_node);
			
 
				+					switch (res)
			
 
				+					{
			
 
				+						case 0:
			
 
				+							/* The task task has been launched with no error */
			
 
				+							break;
			
 
				+						case -EAGAIN:
			
 
				+							_STARPU_DISP("ouch, Xeon Phi could not actually run task %p, putting it back...\n", tasks[i]);
			
 
				+							_starpu_push_task_to_workers(tasks[i]);
			
 
				+							STARPU_ABORT();
			
 
				+							continue;
			
 
				+							break;
			
 
				+						default:
			
 
				+							STARPU_ASSERT(0);
			
 
				+					}
			
 
				+				}
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+	free(tasks);
			
 
				+
			
 
				+	_starpu_handle_all_pending_node_data_requests(memnode);
			
 
				+
			
 
				+	/* In case there remains some memory that was automatically
			
 
				+	 * allocated by StarPU, we release it now. Note that data
			
 
				+	 * coherency is not maintained anymore at that point ! */
			
 
				+	_starpu_free_all_automatically_allocated_buffers(memnode);
			
 
				+
			
 
				+}
			
--- a/src/drivers/mp_common/source_common.h
+++ b/src/drivers/mp_common/source_common.h
@@ -21,28 +21,42 @@
 
				 
			
 
				 #ifdef STARPU_USE_MP
			
 
				 
			
 
				+#include <core/sched_policy.h>
			
 
				+#include <core/task.h>
			
 
				 #include <drivers/mp_common/mp_common.h>
			
 
				 
			
 
				+
			
 
				+enum _starpu_mp_command _starpu_src_common_wait_command_sync(struct _starpu_mp_node *node, 
			
 
				+							     void ** arg, int* arg_size);
			
 
				+void _starpu_src_common_recv_async(struct _starpu_worker_set *worker_set, 
			
 
				+				   struct _starpu_mp_node * baseworker_node);
			
 
				+
			
 
				+int _starpu_src_common_store_message(struct _starpu_mp_node *node, 
			
 
				+		void * arg, int arg_size, enum _starpu_mp_command answer);
			
 
				+
			
 
				+enum _starpu_mp_command _starpu_src_common_wait_completed_execution(struct _starpu_mp_node *node, int devid, void **arg, int * arg_size);
			
 
				+
			
 
				 int _starpu_src_common_sink_nbcores (const struct _starpu_mp_node *node, int *buf);
			
 
				 
			
 
				 int _starpu_src_common_lookup(const struct _starpu_mp_node *node,
			
 
				 			      void (**func_ptr)(void), const char *func_name);
			
 
				 
			
 
				-int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
			
 
				-				      void (*kernel)(void), unsigned coreid,
			
 
				-				      starpu_data_handle_t *handles, void **interfaces, unsigned nb_interfaces,
			
 
				-				      void *cl_arg, size_t cl_arg_size);
			
 
				-
			
 
				-int _starpu_src_common_execute_kernel_from_task(const struct _starpu_mp_node *node,
			
 
				-						void (*kernel)(void), unsigned coreid,
			
 
				-						struct starpu_task *task);
			
 
				-
			
 
				 int _starpu_src_common_allocate(const struct _starpu_mp_node *mp_node,
			
 
				 				void **addr, size_t size);
			
 
				 
			
 
				 void _starpu_src_common_free(const struct _starpu_mp_node *mp_node,
			
 
				 			     void *addr);
			
 
				 
			
 
				+int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
			
 
				+				      void (*kernel)(void), unsigned coreid,
			
 
				+				      enum starpu_codelet_type type,
			
 
				+				      int is_parallel_task, int cb_workerid,
			
 
				+				      starpu_data_handle_t *handles,
			
 
				+				      void **interfaces,
			
 
				+				      unsigned nb_interfaces,
			
 
				+				      void *cl_arg, size_t cl_arg_size);
			
 
				+
			
 
				+
			
 
				 int _starpu_src_common_copy_host_to_sink(const struct _starpu_mp_node *mp_node,
			
 
				 					 void *src, void *dst, size_t size);
			
 
				 
			
@@ -57,6 +71,11 @@ int _starpu_src_common_locate_file(char *located_file_name,
 
				 				   const char *config_file_name, const char *actual_file_name,
			
 
				 				   const char **suffixes);
			
 
				 
			
 
				+void _starpu_src_common_worker(struct _starpu_worker_set * worker_set, 
			
 
				+			       unsigned baseworkerid, 
			
 
				+			       struct _starpu_mp_node * node_set);
			
 
				+
			
 
				+
			
 
				 #endif /* STARPU_USE_MP */
			
 
				 
			
 
				 
			
--- a/src/drivers/scc/driver_scc_common.c
+++ b/src/drivers/scc/driver_scc_common.c
@@ -172,3 +172,11 @@ void _starpu_scc_common_report_rcce_error(const char *func, const char *file, co
 
				 	fprintf(stderr, "RCCE error in %s (%s:%d): %s\n", func, file, line, error_string); 
			
 
				 	STARPU_ABORT();
			
 
				 }
			
 
				+
			
 
				+int _starpu_scc_common_recv_is_ready(const struct _starpu_mp_node *mp_node)
			
 
				+{
			
 
				+  /***********
			
 
				+      TODO
			
 
				+  ************/
			
 
				+  STARPU_ASSERT(0);
			
 
				+}
			
--- a/src/drivers/scc/driver_scc_common.h
+++ b/src/drivers/scc/driver_scc_common.h
@@ -44,6 +44,8 @@ void _starpu_scc_common_recv(const struct _starpu_mp_node *node, void *msg, int
 
				 
			
 
				 void _starpu_scc_common_report_rcce_error(const char *func, const char *file, const int line, const int err_no);
			
 
				 
			
 
				+int _starpu_scc_common_recv_is_ready(const struct _starpu_mp_node *mp_node);
			
 
				+
			
 
				 #endif /* STARPU_USE_SCC */
			
 
				 
			
 
				 
			
--- a/src/drivers/scc/driver_scc_sink.c
+++ b/src/drivers/scc/driver_scc_sink.c
@@ -16,6 +16,7 @@
 
				 
			
 
				 
			
 
				 #include <RCCE.h>
			
 
				+#include <dlfcn.h>
			
 
				 
			
 
				 #include <datawizard/interfaces/data_interface.h>
			
 
				 #include <drivers/mp_common/sink_common.h>
			
@@ -27,6 +28,23 @@
 
				 void _starpu_scc_sink_init(struct _starpu_mp_node *node)
			
 
				 {
			
 
				 	node->mp_connection.scc_nodeid = _starpu_scc_common_get_src_node_id();
			
 
				+
			
 
				+	/****************
			
 
				+	 *     TODO     *
			
 
				+	 * get nb_cores *
			
 
				+	 ****************/
			
 
				+	node->nb_cores = 1; 
			
 
				+	STARPU_ASSERT(0);
			
 
				+
			
 
				+}
			
 
				+
			
 
				+void _starpu_scc_sink_launch_workers(struct _starpu_mp_node *node)
			
 
				+{
			
 
				+	/*****************
			
 
				+	 *     TODO      *
			
 
				+	 * init thread   *
			
 
				+	 *****************/
			
 
				+	STARPU_ASSERT(0);
			
 
				 }
			
 
				 
			
 
				 void _starpu_scc_sink_deinit(struct _starpu_mp_node *node)
			
@@ -51,6 +69,15 @@ void _starpu_scc_sink_recv_from_device(const struct _starpu_mp_node *node, int s
 
				 		STARPU_MP_COMMON_REPORT_ERROR(node, ret);
			
 
				 }
			
 
				 
			
 
				+void _starpu_scc_sink_bind_thread(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, cpu_set_t * cpuset, int coreid, pthread_t *thread)
			
 
				+{
			
 
				+	/****************
			
 
				+	 *     TODO     *
			
 
				+	 ****************/
			
 
				+	STARPU_ASSERT(0);
			
 
				+}
			
 
				+
			
 
				+
			
 
				 /* arg -> [Function pointer on sink, number of interfaces, interfaces
			
 
				  * (union _starpu_interface), cl_arg]
			
 
				  *
			
@@ -124,3 +151,10 @@ void _starpu_scc_sink_execute(const struct _starpu_mp_node *node, void *arg, int
 
				 
			
 
				 	_starpu_sink_common_execute(node, arg, arg_size);
			
 
				 }
			
 
				+
			
 
				+void (*_starpu_scc_sink_lookup (const struct _starpu_mp_node * node STARPU_ATTRIBUTE_UNUSED, char* func_name))(void)
			
 
				+{
			
 
				+	void *dl_handle = dlopen(NULL, RTLD_NOW);
			
 
				+	return dlsym(dl_handle, func_name);
			
 
				+}
			
 
				+
			
--- a/src/drivers/scc/driver_scc_sink.h
+++ b/src/drivers/scc/driver_scc_sink.h
@@ -25,13 +25,18 @@
 
				 #include <drivers/mp_common/mp_common.h>
			
 
				 
			
 
				 void _starpu_scc_sink_init(struct _starpu_mp_node *node);
			
 
				+void _starpu_scc_sink_launch_workers(struct _starpu_mp_node *node);
			
 
				 void _starpu_scc_sink_deinit(struct _starpu_mp_node *node);
			
 
				 
			
 
				 void _starpu_scc_sink_send_to_device(const struct _starpu_mp_node *node, int dst_devid, void *msg, int len);
			
 
				 void _starpu_scc_sink_recv_from_device(const struct _starpu_mp_node *node, int src_devid, void *msg, int len);
			
 
				 
			
 
				+void _starpu_scc_sink_bind_thread(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, cpu_set_t * cpuset, int coreid, pthread_t *thread);
			
 
				+
			
 
				 void _starpu_scc_sink_execute(const struct _starpu_mp_node *node, void *arg, int arg_size);
			
 
				 
			
 
				+void (*_starpu_scc_sink_lookup (const struct _starpu_mp_node * node STARPU_ATTRIBUTE_UNUSED, char* func_name))(void);
			
 
				+
			
 
				 #endif /* STARPU_USE_SCC */
			
 
				 
			
 
				 
			
--- a/src/drivers/scc/driver_scc_source.c
+++ b/src/drivers/scc/driver_scc_source.c
@@ -60,79 +60,40 @@ static void _starpu_scc_src_deinit_context(int devid)
 
				 
			
 
				 	_starpu_mp_common_node_destroy(scc_mp_nodes[devid]);
			
 
				 }
			
 
				-
			
 
				-static int _starpu_scc_src_execute_job(struct _starpu_job *j, struct _starpu_worker *args)
			
 
				+void (*_starpu_scc_src_get_kernel_from_job(const struct _starpu_mp_node *,struct _starpu_job *j))(void)
			
 
				 {
			
 
				-	int ret;
			
 
				-	uint32_t mask = 0;
			
 
				-
			
 
				-	STARPU_ASSERT(j);
			
 
				-	struct starpu_task *task = j->task;
			
 
				-
			
 
				-	struct timespec codelet_start, codelet_end;
			
 
				-
			
 
				-	int profiling = starpu_profiling_status_get();
			
 
				-	unsigned calibrate_model = 0;
			
 
				-
			
 
				-	STARPU_ASSERT(task);
			
 
				-	struct starpu_codelet *cl = task->cl;
			
 
				-	STARPU_ASSERT(cl);
			
 
				-
			
 
				-	if (cl->model && cl->model->benchmarking)
			
 
				-		calibrate_model = 1;
			
 
				-
			
 
				-	ret = _starpu_fetch_task_input(j, mask);
			
 
				-	if (ret != 0)
			
 
				+  starpu_scc_kernel_t kernel = NULL;
			
 
				+
			
 
				+  starpu_scc_func_t func = _starpu_task_get_scc_nth_implementation(j->task->cl, j->nimpl);
			
 
				+  if (func)
			
 
				+    {
			
 
				+      /* We execute the function contained in the codelet, it must return a
			
 
				+       * pointer to the function to execute on the device, either specified
			
 
				+       * directly by the user or by a call to starpu_scc_get_kernel().
			
 
				+       */
			
 
				+      kernel = func();
			
 
				+    }
			
 
				+  else
			
 
				+    {
			
 
				+      /* If user doesn't define any starpu_scc_func_t in cl->scc_funcs we try to use
			
 
				+       * cpu_funcs_name.
			
 
				+       */
			
 
				+      char *func_name = _starpu_task_get_cpu_name_nth_implementation(j->task->cl, j->nimpl);
			
 
				+      if (func_name)
			
 
				 	{
			
 
				-		/* there was not enough memory, so the input of
			
 
				-		 * the codelet cannot be fetched ... put the
			
 
				-		 * codelet back, and try it later */
			
 
				-		return -EAGAIN;
			
 
				-	}
			
 
				-
			
 
				+	  starpu_scc_func_symbol_t symbol;
			
 
				 
			
 
				-	starpu_scc_kernel_t kernel = NULL;
			
 
				+	  _starpu_scc_src_register_kernel(&symbol, func_name);
			
 
				 
			
 
				-	starpu_scc_func_t func = _starpu_task_get_scc_nth_implementation(j->task->cl, j->nimpl);
			
 
				-	if (func)
			
 
				-	{
			
 
				-		/* We execute the function contained in the codelet, it must return a
			
 
				-		 * pointer to the function to execute on the device, either specified
			
 
				-		 * directly by the user or by a call to starpu_scc_get_kernel().
			
 
				-		 */
			
 
				-		kernel = func();
			
 
				+	  kernel = _starpu_scc_src_get_kernel(symbol);
			
 
				 	}
			
 
				-	else
			
 
				-	{
			
 
				-		/* If user doesn't define any starpu_scc_func_t in cl->scc_funcs we try to use
			
 
				-		 * cpu_funcs_name.
			
 
				-		 */
			
 
				-		char *func_name = _starpu_task_get_cpu_name_nth_implementation(j->task->cl, j->nimpl);
			
 
				-		if (func_name)
			
 
				-		{
			
 
				-			starpu_scc_func_symbol_t symbol;
			
 
				-
			
 
				-			_starpu_scc_src_register_kernel(&symbol, func_name);
			
 
				-
			
 
				-			kernel = _starpu_scc_src_get_kernel(symbol);
			
 
				-		}
			
 
				-	}
			
 
				-	STARPU_ASSERT(kernel);
			
 
				-
			
 
				-	_starpu_driver_start_job(args, j, &codelet_start, 0, profiling);
			
 
				-
			
 
				-	_starpu_src_common_execute_kernel_from_task(scc_mp_nodes[args->devid], (void (*)(void)) kernel, 0, task);
			
 
				-
			
 
				-	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0, profiling);
			
 
				-
			
 
				-	_starpu_driver_update_job_feedback(j, args, args->perf_arch, &codelet_start, &codelet_end, profiling);
			
 
				-
			
 
				-	_starpu_push_task_output(j, mask);
			
 
				+    }
			
 
				+  STARPU_ASSERT(kernel);  
			
 
				 
			
 
				-
			
 
				-	return 0;
			
 
				+  return (void (*)(void))kernel;
			
 
				 }
			
 
				 
			
 
				+
			
 
				 void _starpu_scc_src_mp_deinit()
			
 
				 {
			
 
				 	_starpu_scc_common_unmap_shared_memory();
			
@@ -320,7 +281,7 @@ int _starpu_scc_copy_sink_to_sink(void *src, unsigned src_node, void *dst, unsig
 
				 
			
 
				 void *_starpu_scc_src_worker(void *arg)
			
 
				 {
			
 
				-	struct _starpu_worker *args = arg;
			
 
				+	struct _starpu_worker_set *args = arg;
			
 
				 
			
 
				 	int devid = args->devid;
			
 
				 	int workerid = args->workerid;
			
@@ -350,64 +311,10 @@ void *_starpu_scc_src_worker(void *arg)
 
				 	STARPU_PTHREAD_COND_SIGNAL(&args->ready_cond);
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&args->mutex);
			
 
				 
			
 
				-	struct _starpu_job * j;
			
 
				-	struct starpu_task *task;
			
 
				-	int res;
			
 
				-
			
 
				-	while (_starpu_machine_is_running())
			
 
				-	{
			
 
				-		_STARPU_TRACE_START_PROGRESS(memnode);
			
 
				-		_starpu_datawizard_progress(memnode, 1);
			
 
				-		_STARPU_TRACE_END_PROGRESS(memnode);
			
 
				-
			
 
				-		task = _starpu_get_worker_task(args, workerid, memnode);
			
 
				-		if (!task)
			
 
				-			continue;
			
 
				-
			
 
				-		j = _starpu_get_job_associated_to_task(task);
			
 
				-
			
 
				-		/* can a SCC device do that task ? */
			
 
				-		if (!_STARPU_SCC_MAY_PERFORM(j))
			
 
				-		{
			
 
				-			/* this isn't a SCC task */
			
 
				-			_starpu_push_task_to_workers(task);
			
 
				-			continue;
			
 
				-		}
			
 
				-
			
 
				-		_starpu_set_current_task(task);
			
 
				-		args->current_task = j->task;
			
 
				-
			
 
				-		res = _starpu_scc_src_execute_job(j, args);
			
 
				-
			
 
				-		_starpu_set_current_task(NULL);
			
 
				-		args->current_task = NULL;
			
 
				-
			
 
				-		if (res)
			
 
				-		{
			
 
				-			switch (res)
			
 
				-			{
			
 
				-				case -EAGAIN:
			
 
				-					_STARPU_DISP("ouch, SCC could not actually run task %p, putting it back...\n", task);
			
 
				-					_starpu_push_task_to_workers(task);
			
 
				-					STARPU_ABORT();
			
 
				-					continue;
			
 
				-				default:
			
 
				-					STARPU_ASSERT(0);
			
 
				-			}
			
 
				-		}
			
 
				-
			
 
				-		_starpu_handle_job_termination(j);
			
 
				-	}
			
 
				+	_starpu_src_common_worker(args, baseworkerid, scc_mp_nodes[mp_nodeid]);
			
 
				 
			
 
				 	_STARPU_TRACE_WORKER_DEINIT_START;
			
 
				 
			
 
				-	_starpu_handle_all_pending_node_data_requests(memnode);
			
 
				-
			
 
				-	/* In case there remains some memory that was automatically
			
 
				-	 * allocated by StarPU, we release it now. Note that data
			
 
				-	 * coherency is not maintained anymore at that point ! */
			
 
				-	_starpu_free_all_automatically_allocated_buffers(memnode);
			
 
				-
			
 
				 	_starpu_scc_src_deinit_context(args->devid);
			
 
				 
			
 
				 	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_SCC_KEY);
			
--- a/src/drivers/scc/driver_scc_source.h
+++ b/src/drivers/scc/driver_scc_source.h
@@ -29,6 +29,7 @@
 
				 
			
 
				 void _starpu_scc_src_mp_deinit();
			
 
				 
			
 
				+void (*_starpu_scc_src_get_kernel_from_job(const struct _starpu_mp_node *,struct _starpu_job *j))(void);
			
 
				 int _starpu_scc_src_register_kernel(starpu_scc_func_symbol_t *symbol, const char *func_name);
			
 
				 starpu_scc_kernel_t _starpu_scc_src_get_kernel(starpu_scc_func_symbol_t symbol);
			
 
				 
			
--- a/src/sched_policies/parallel_eager.c
+++ b/src/sched_policies/parallel_eager.c
@@ -29,8 +29,8 @@ struct _starpu_peager_data
 
				         starpu_pthread_mutex_t policy_mutex;
			
 
				 };
			
 
				 
			
 
				-#define STARPU_NMAXCOMBINED_WORKERS 10
			
 
				-/* XXX instead of 10, we should use some "MAX combination .."*/
			
 
				+#define STARPU_NMAXCOMBINED_WORKERS 520 
			
 
				+/* instead of STARPU_NMAXCOMBINED_WORKERS, we should use some "MAX combination .."*/
			
 
				 static int possible_combinations_cnt[STARPU_NMAXWORKERS];
			
 
				 static int possible_combinations[STARPU_NMAXWORKERS][STARPU_NMAXCOMBINED_WORKERS];
			
 
				 static int possible_combinations_size[STARPU_NMAXWORKERS][STARPU_NMAXCOMBINED_WORKERS];
			
@@ -42,15 +42,11 @@ static int possible_combinations_size[STARPU_NMAXWORKERS][STARPU_NMAXCOMBINED_WO
 
				 
			
 
				 static void peager_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
			
 
				 {
			
 
				+	_starpu_sched_find_worker_combinations(workerids, nworkers);
			
 
				 	struct _starpu_peager_data *data = (struct _starpu_peager_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
			
 
				 	unsigned nbasic_workers = starpu_worker_get_count();
			
 
				-		
			
 
				-	_starpu_sched_find_worker_combinations(workerids, nworkers);
			
 
				-
			
 
				+	unsigned ncombined_workers= starpu_combined_worker_get_count();
			
 
				 	unsigned workerid, i;
			
 
				-	unsigned ncombinedworkers;
			
 
				-
			
 
				-	ncombinedworkers = starpu_combined_worker_get_count();
			
 
				 
			
 
				 	/* Find the master of each worker. We first assign the worker as its
			
 
				 	 * own master, and then iterate over the different worker combinations
			
@@ -67,7 +63,7 @@ static void peager_add_workers(unsigned sched_ctx_id, int *workerids, unsigned n
 
				 	}
			
 
				 
			
 
				 
			
 
				-	for (i = 0; i < ncombinedworkers; i++)
			
 
				+	for (i = 0; i < ncombined_workers; i++)
			
 
				 	{
			
 
				 		workerid = nbasic_workers + i;
			
 
				 
			
@@ -77,20 +73,17 @@ static void peager_add_workers(unsigned sched_ctx_id, int *workerids, unsigned n
 
				 		starpu_combined_worker_get_description(workerid, &size, &workers);
			
 
				 
			
 
				 		int master = workers[0];
			
 
				-
			
 
				 		int j;
			
 
				 		for (j = 0; j < size; j++)
			
 
				 		{
			
 
				 			if (data->master_id[workers[j]] > master)
			
 
				 				data->master_id[workers[j]] = master;
			
 
				-
			
 
				 			int cnt = possible_combinations_cnt[workers[j]]++;
			
 
				 			possible_combinations[workers[j]][cnt] = workerid;
			
 
				 			possible_combinations_size[workers[j]][cnt] = size;
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-
			
 
				 	for(i = 0; i < nworkers; i++)
			
 
				 	{
			
 
				 		workerid = workerids[i];
			
@@ -176,9 +169,11 @@ static int push_task_peager_policy(struct starpu_task *task)
 
				 	{
			
 
				 		worker = workers->get_next(workers, &it);
			
 
				 		int master = data->master_id[worker];
			
 
				-		/* If this is not a CPU, then the worker simply grabs tasks from the fifo */
			
 
				-		if ((!starpu_worker_is_combined_worker(worker) && starpu_worker_get_type(worker) != STARPU_CPU_WORKER)
			
 
				-		    || (master == worker))
			
 
				+		/* If this is not a CPU or a MIC, then the worker simply grabs tasks from the fifo */
			
 
				+		if ((!starpu_worker_is_combined_worker(worker) && 
			
 
				+		    starpu_worker_get_type(worker) != STARPU_MIC_WORKER &&
			
 
				+		    starpu_worker_get_type(worker) != STARPU_CPU_WORKER)  
			
 
				+			|| (master == worker))
			
 
				 		{
			
 
				 			starpu_pthread_mutex_t *sched_mutex;
			
 
				 			starpu_pthread_cond_t *sched_cond;
			
@@ -198,8 +193,8 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 
				 
			
 
				 	int workerid = starpu_worker_get_id();
			
 
				 
			
 
				-	/* If this is not a CPU, then the worker simply grabs tasks from the fifo */
			
 
				-	if (starpu_worker_get_type(workerid) != STARPU_CPU_WORKER)
			
 
				+	/* If this is not a CPU or a MIC, then the worker simply grabs tasks from the fifo */
			
 
				+	if (starpu_worker_get_type(workerid) != STARPU_CPU_WORKER && starpu_worker_get_type(workerid) != STARPU_MIC_WORKER)
			
 
				 	{
			
 
				 		struct starpu_task *task = NULL;
			
 
				 		STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
			
@@ -211,6 +206,9 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 
				 
			
 
				 	int master = data->master_id[workerid];
			
 
				 
			
 
				+	//_STARPU_DEBUG("workerid:%d, master:%d\n",workerid,master);
			
 
				+
			
 
				+
			
 
				 	if (master == workerid)
			
 
				 	{
			
 
				 		/* The worker is a master */
			
@@ -248,9 +246,9 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 
				 		/* Is this a basic worker or a combined worker ? */
			
 
				 		int nbasic_workers = (int)starpu_worker_get_count();
			
 
				 		int is_basic_worker = (best_workerid < nbasic_workers);
			
 
				-
			
 
				 		if (is_basic_worker)
			
 
				 		{
			
 
				+
			
 
				 			/* The master is alone */
			
 
				 			return task;
			
 
				 		}
			
--- a/src/util/starpu_data_cpy.c
+++ b/src/util/starpu_data_cpy.c
@@ -78,8 +78,13 @@ void mp_cpy_kernel(void *descr[], void *cl_arg)
 
				 	void *dst_interface = descr[0];
			
 
				 	void *src_interface = descr[1];
			
 
				 
			
 
				-	STARPU_ASSERT(copy_methods->ram_to_ram);
			
 
				-	copy_methods->ram_to_ram(src_interface, 0, dst_interface, 0);
			
 
				+	if(copy_methods->ram_to_ram)
			
 
				+		copy_methods->ram_to_ram(src_interface, STARPU_MAIN_RAM, dst_interface, STARPU_MAIN_RAM);
			
 
				+	else if(copy_methods->any_to_any)
			
 
				+		copy_methods->any_to_any(src_interface, STARPU_MAIN_RAM, dst_interface, STARPU_MAIN_RAM, NULL);
			
 
				+	else
			
 
				+		STARPU_ABORT();
			
 
				+
			
 
				 }
			
 
				 
			
 
				 static starpu_mic_kernel_t mic_cpy_func()
			
--- a/tests/datawizard/acquire_cb_insert.c
+++ b/tests/datawizard/acquire_cb_insert.c
@@ -77,6 +77,8 @@ int main(int argc, char **argv)
 
				 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				+	if(starpu_cpu_worker_get_count() == 0) return STARPU_TEST_SKIPPED;
			
 
				+
			
 
				 	/* Declare x */
			
 
				 	starpu_variable_data_register(&x_handle, STARPU_MAIN_RAM, (uintptr_t)&x, sizeof(x));
			
 
				 
			
--- a/tests/datawizard/commute.c
+++ b/tests/datawizard/commute.c
@@ -85,7 +85,9 @@ void end(void *descr[], void *_args STARPU_ATTRIBUTE_UNUSED)
 
				 {
			
 
				 	int *x = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				 
			
 
				-	if (codelet_end.modes[0] & STARPU_W)
			
 
				+	enum starpu_data_access_mode end_mode = *(enum starpu_data_access_mode*) _args;
			
 
				+
			
 
				+	if (end_mode & STARPU_W)
			
 
				 		(*x)++;
			
 
				 }
			
 
				 
			
@@ -105,7 +107,7 @@ static void test(enum starpu_data_access_mode begin_mode, enum starpu_data_acces
 
				 	int ret;
			
 
				 
			
 
				 	codelet_begin.modes[0] = begin_mode;
			
 
				-	codelet_end.modes[0] = end_mode;
			
 
				+	codelet_end.modes[0] = end_mode;	
			
 
				 
			
 
				 	begin_t = starpu_task_create();
			
 
				 	begin_t->cl = &codelet_begin;
			
@@ -130,6 +132,8 @@ static void test(enum starpu_data_access_mode begin_mode, enum starpu_data_acces
 
				 	end_t->cl = &codelet_end;
			
 
				 	end_t->handles[0] = x_handle;
			
 
				 	end_t->detach = 0;
			
 
				+	end_t->cl_arg = &end_mode;
			
 
				+	end_t->cl_arg_size = sizeof(end_mode);
			
 
				 
			
 
				 	if (starpu_task_submit(begin_t) == -ENODEV)
			
 
				 		exit(STARPU_TEST_SKIPPED);
			
--- a/tests/datawizard/data_invalidation.c
+++ b/tests/datawizard/data_invalidation.c
@@ -141,6 +141,8 @@ int main(int argc, char **argv)
 
				 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
			
 
				 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
			
 
				 
			
 
				+	if(starpu_cpu_worker_get_count() == 0) return STARPU_TEST_SKIPPED;
			
 
				+
			
 
				 	/* The buffer should never be explicitely allocated */
			
 
				 	starpu_vector_data_register(&v_handle, (uint32_t)-1, (uintptr_t)NULL, VECTORSIZE, sizeof(char));
			
 
				 
			
--- a/tests/datawizard/interfaces/coo/coo_interface.c
+++ b/tests/datawizard/interfaces/coo/coo_interface.c
@@ -21,7 +21,7 @@
 
				 #define MATRIX_SIZE (NX*NY)
			
 
				 
			
 
				 #if defined(STARPU_USE_CPU) || defined(STAPRU_USE_MIC)
			
 
				-static void test_coo_cpu_func(void *buffers[], void *args);
			
 
				+void test_coo_cpu_func(void *buffers[], void *args);
			
 
				 #endif
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 extern void test_coo_cuda_func(void *buffers[], void *args);
			
--- a/tests/errorcheck/invalid_blocking_calls.c
+++ b/tests/errorcheck/invalid_blocking_calls.c
@@ -47,7 +47,6 @@ static struct starpu_codelet wrong_codelet =
 
				 	.cpu_funcs = {wrong_func, NULL},
			
 
				 	.cuda_funcs = {wrong_func, NULL},
			
 
				         .opencl_funcs = {wrong_func, NULL},
			
 
				-	.cpu_funcs_name = {"wrong_func", NULL},
			
 
				 	.model = NULL,
			
 
				 	.nbuffers = 0
			
 
				 };
			
--- a/tests/loader-cross.sh.in
+++ b/tests/loader-cross.sh.in
@@ -6,4 +6,4 @@ top_builddir="@top_builddir@"
 
				 NATIVE=${PWD/\/build_mic\//\/build_host\/}
			
 
				 DIR="$(dirname "$1")"
			
 
				 FILE="$(basename "$1")"
			
 
				-SINK_LD_LIBRARY_PATH="$top_builddir/src/.libs" STARPU_MIC_SINK_PROGRAM_NAME="$DIR/.libs/$FILE" $top_srcdir/build_host/tests/loader "$NATIVE/$1"
			
 
				+SINK_LD_LIBRARY_PATH="$top_builddir/src/.libs" STARPU_MIC_SINK_PROGRAM_NAME="$DIR/.libs/$FILE" $top_builddir/../build_host/tests/loader "$NATIVE/$1"
			
--- a/tests/main/starpu_init.c
+++ b/tests/main/starpu_init.c
@@ -83,6 +83,7 @@ int main(int argc, char **argv)
 
				 
			
 
				 	ret = check_cpu(-1, -1, -1, &cpu_init);
			
 
				 	if (ret) return ret;
			
 
				+	if (cpu_init == 0) return STARPU_TEST_SKIPPED;
			
 
				 
			
 
				 	if (cpu_init >= STARPU_MAXCPUS-5)
			
 
				 	{
			
--- a/tests/parallel_tasks/explicit_combined_worker.c
+++ b/tests/parallel_tasks/explicit_combined_worker.c
@@ -24,7 +24,7 @@
 
				 #define N	1000
			
 
				 #define VECTORSIZE	1024
			
 
				 
			
 
				-static void codelet_null(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
			
 
				+void codelet_null(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
			
 
				 {
			
 
				 	STARPU_SKIP_IF_VALGRIND;
			
 
				 
			
@@ -43,6 +43,7 @@ static struct starpu_codelet cl =
 
				 	.type = STARPU_FORKJOIN,
			
 
				 	.max_parallelism = INT_MAX,
			
 
				 	.cpu_funcs = {codelet_null, NULL},
			
 
				+	.cpu_funcs_name = {"codelet_null", NULL},
			
 
				 	.cuda_funcs = {codelet_null, NULL},
			
 
				         .opencl_funcs = {codelet_null, NULL},
			
 
				 	.nbuffers = 1,
			
--- a/tests/parallel_tasks/parallel_kernels.c
+++ b/tests/parallel_tasks/parallel_kernels.c
@@ -24,7 +24,7 @@
 
				 #define N	1000
			
 
				 #define VECTORSIZE	1024
			
 
				 
			
 
				-static void codelet_null(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
			
 
				+void codelet_null(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
			
 
				 {
			
 
				 	STARPU_SKIP_IF_VALGRIND;
			
 
				 
			
@@ -50,6 +50,7 @@ static struct starpu_codelet cl =
 
				 	.max_parallelism = INT_MAX,
			
 
				 	.cpu_funcs = {codelet_null, NULL},
			
 
				 	.cuda_funcs = {codelet_null, NULL},
			
 
				+	.cpu_funcs_name = {"codelet_null", NULL},
			
 
				         .opencl_funcs = {codelet_null, NULL},
			
 
				 	.model = &model,
			
 
				 	.nbuffers = 1,
			
--- a/tests/parallel_tasks/parallel_kernels_spmd.c
+++ b/tests/parallel_tasks/parallel_kernels_spmd.c
@@ -24,7 +24,7 @@
 
				 #define N	1000
			
 
				 #define VECTORSIZE	1024
			
 
				 
			
 
				-static void codelet_null(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
			
 
				+void codelet_null(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
			
 
				 {
			
 
				 	STARPU_SKIP_IF_VALGRIND;
			
 
				 
			
@@ -51,6 +51,7 @@ static struct starpu_codelet cl =
 
				 	.type = STARPU_SPMD,
			
 
				 	.max_parallelism = INT_MAX,
			
 
				 	.cpu_funcs = {codelet_null, NULL},
			
 
				+	.cpu_funcs_name = {"codelet_null", NULL},
			
 
				 	.cuda_funcs = {codelet_null, NULL},
			
 
				         .opencl_funcs = {codelet_null, NULL},
			
 
				 	.model = &model,
			
--- a/tests/parallel_tasks/spmd_peager.c
+++ b/tests/parallel_tasks/spmd_peager.c
@@ -26,7 +26,7 @@
 
				 starpu_data_handle_t v_handle;
			
 
				 static unsigned *v;
			
 
				 
			
 
				-static void codelet_null(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
			
 
				+void codelet_null(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
			
 
				 {
			
 
				 	STARPU_SKIP_IF_VALGRIND;
			
 
				 
			
@@ -49,6 +49,7 @@ static struct starpu_codelet cl =
 
				 	.type = STARPU_SPMD,
			
 
				 	.max_parallelism = INT_MAX,
			
 
				 	.cpu_funcs = {codelet_null, NULL},
			
 
				+	.cpu_funcs_name = {"codelet_null", NULL},
			
 
				 	.cuda_funcs = {codelet_null, NULL},
			
 
				         .opencl_funcs = {codelet_null, NULL},
			
 
				 	.nbuffers = 1,
			
--- a/tests/sched_policies/execute_all_tasks.c
+++ b/tests/sched_policies/execute_all_tasks.c
@@ -26,7 +26,7 @@
 
				 
			
 
				 #define NTASKS           8
			
 
				 
			
 
				-static void
			
 
				+void
			
 
				 dummy(void *buffers[], void *args)
			
 
				 {
			
 
				 	(void) buffers;
			
@@ -50,6 +50,7 @@ run(struct starpu_sched_policy *p)
 
				 	struct starpu_codelet cl =
			
 
				 	{
			
 
				 		.cpu_funcs    = {dummy, NULL},
			
 
				+		.cpu_funcs_name = {"dummy", NULL},
			
 
				 		.cuda_funcs   = {dummy, NULL},
			
 
				 		.opencl_funcs = {dummy, NULL},
			
 
				 		.nbuffers     = 0