Prechádzať zdrojové kódy

merge branches/mic in trunk

Thibaud Lambert 12 rokov pred
rodič
commit
67a384dfbb
62 zmenil súbory, kde vykonal 1798 pridanie a 2058 odobranie
  1. 15 12
      configure.ac
  2. 1 1
      include/starpu_perfmodel.h
  3. 15 4
      mic-configure
  4. 2 58
      mpi/examples/Makefile.am
  5. 0 19
      mpi/examples/mpi_lu/pdlu_implicit.c
  6. 0 6
      mpi/examples/mpi_lu/plu_example.c
  7. 0 357
      mpi/examples/mpi_lu/plu_implicit_example.c
  8. 0 19
      mpi/examples/mpi_lu/plu_implicit_example_double.c
  9. 0 19
      mpi/examples/mpi_lu/plu_implicit_example_float.c
  10. 0 381
      mpi/examples/mpi_lu/plu_outofcore_example.c
  11. 0 19
      mpi/examples/mpi_lu/plu_outofcore_example_double.c
  12. 0 19
      mpi/examples/mpi_lu/plu_outofcore_example_float.c
  13. 0 19
      mpi/examples/mpi_lu/pslu_implicit.c
  14. 0 162
      mpi/examples/mpi_lu/pxlu_implicit.c
  15. 1 0
      mpi/src/starpu_mpi_collective.c
  16. 0 3
      mpi/tests/Makefile.am
  17. 41 159
      mpi/tests/mpi_earlyrecv2.c
  18. 4 0
      src/common/barrier_counter.h
  19. 21 3
      src/core/combined_workers.c
  20. 86 25
      src/core/detect_combined_workers.c
  21. 4 1
      src/core/jobs.c
  22. 0 1
      src/core/sched_policy.c
  23. 6 2
      src/core/topology.c
  24. 9 3
      src/core/workers.c
  25. 4 0
      src/core/workers.h
  26. 2 1
      src/datawizard/data_request.c
  27. 0 2
      src/datawizard/filters.c
  28. 9 11
      src/datawizard/reduction.c
  29. 88 16
      src/drivers/driver_common/driver_common.c
  30. 1 1
      src/drivers/driver_common/driver_common.h
  31. 18 4
      src/drivers/mic/driver_mic_common.c
  32. 2 0
      src/drivers/mic/driver_mic_common.h
  33. 88 14
      src/drivers/mic/driver_mic_sink.c
  34. 5 3
      src/drivers/mic/driver_mic_sink.h
  35. 50 215
      src/drivers/mic/driver_mic_source.c
  36. 3 1
      src/drivers/mic/driver_mic_source.h
  37. 155 101
      src/drivers/mp_common/mp_common.c
  38. 88 29
      src/drivers/mp_common/mp_common.h
  39. 487 142
      src/drivers/mp_common/sink_common.c
  40. 9 2
      src/drivers/mp_common/sink_common.h
  41. 432 64
      src/drivers/mp_common/source_common.c
  42. 28 9
      src/drivers/mp_common/source_common.h
  43. 8 0
      src/drivers/scc/driver_scc_common.c
  44. 2 0
      src/drivers/scc/driver_scc_common.h
  45. 34 0
      src/drivers/scc/driver_scc_sink.c
  46. 5 0
      src/drivers/scc/driver_scc_sink.h
  47. 28 121
      src/drivers/scc/driver_scc_source.c
  48. 1 0
      src/drivers/scc/driver_scc_source.h
  49. 16 18
      src/sched_policies/parallel_eager.c
  50. 7 2
      src/util/starpu_data_cpy.c
  51. 2 0
      tests/datawizard/acquire_cb_insert.c
  52. 6 2
      tests/datawizard/commute.c
  53. 2 0
      tests/datawizard/data_invalidation.c
  54. 1 1
      tests/datawizard/interfaces/coo/coo_interface.c
  55. 0 1
      tests/errorcheck/invalid_blocking_calls.c
  56. 1 1
      tests/loader-cross.sh.in
  57. 1 0
      tests/main/starpu_init.c
  58. 2 1
      tests/parallel_tasks/explicit_combined_worker.c
  59. 2 1
      tests/parallel_tasks/parallel_kernels.c
  60. 2 1
      tests/parallel_tasks/parallel_kernels_spmd.c
  61. 2 1
      tests/parallel_tasks/spmd_peager.c
  62. 2 1
      tests/sched_policies/execute_all_tasks.c

+ 15 - 12
configure.ac

@@ -344,11 +344,6 @@ if test x$enable_cpu = xyes; then
 	AC_DEFINE(STARPU_USE_CPU, [1], [CPU driver is activated])
 fi
 
-# How many parallel worker can we support ?
-nmaxcombinedworkers=`expr 2 \* $maxcpus`
-AC_DEFINE_UNQUOTED(STARPU_NMAX_COMBINEDWORKERS,
-	[$nmaxcombinedworkers], [Maximum number of worker combinations])
-
 ###############################################################################
 #                                                                             #
 #                                 CUDA settings                               #
@@ -969,7 +964,7 @@ AC_DEFINE_UNQUOTED(STARPU_MAXMICDEVS, [$nmaxmicdev],
 AC_MSG_CHECKING(maximum number of MIC threads)
 AC_ARG_ENABLE(maxmicthreads, [AS_HELP_STRING([--enable-maxmicthreads=<number>],
 			[maximum number of MIC threads])],
-			nmaxmicthreads=$enableval, nmaxmicthreads=128)
+			nmaxmicthreads=$enableval, nmaxmicthreads=940)
 AC_MSG_RESULT($nmaxmicthread)
 
 AC_DEFINE_UNQUOTED(STARPU_MAXMICCORES, [$nmaxmicthreads],
@@ -998,7 +993,6 @@ AC_DEFUN([STARPU_CHECK_COI_RUNTIME],
     __coi_dir=$1
     __coi_include_dir=$2
     __coi_lib_dir=$3
-    __coi_lib_name=$4
 
     if test "$__coi_dir" != "no" -a "$__coi_dir" != "" ; then
 	AC_MSG_CHECKING(whether MIC's COI runtime is available in $__coi_dir)
@@ -1027,14 +1021,14 @@ AC_DEFUN([STARPU_CHECK_COI_RUNTIME],
     AC_CHECK_HEADER([source/COIEngine_source.h],[have_valid_coi=yes],[have_valid_coi=no])
 
     if test "$have_valid_coi" = "yes" ; then
-	AC_HAVE_LIBRARY([$__coi_lib_name],[have_valid_coi=yes],[have_valid_coi=no])
+	AC_HAVE_LIBRARY([$4],[have_valid_coi=yes],[have_valid_coi=no])
 
         if test "$have_valid_coi" = "no" ; then
             if test "$3" = "no" -a "$__coi_dir" != "no" ; then
 		# ${__coi_dir}/lib didn't work, let's try with lib64
                 __coi_lib_dir="$__coi_dir/lib64"
 		LDFLAGS="${SAVED_LDFLAGS} -L$__coi_lib_dir"
-	        AC_HAVE_LIBRARY([$__coi_lib_name],[have_valid_coi=yes],[have_valid_coi=no])
+	        AC_HAVE_LIBRARY([$4],[have_valid_coi=yes],[have_valid_coi=no])
             fi
         fi
     fi
@@ -1044,7 +1038,7 @@ AC_DEFUN([STARPU_CHECK_COI_RUNTIME],
     fi
 
     if test "$have_valid_coi" = "yes" -a "$__coi_lib_dir" != "no"; then
-        STARPU_COI_LDFLAGS="-L$__coi_lib_dir -l$__coi_lib_name"
+        STARPU_COI_LDFLAGS="-L$__coi_lib_dir -l$4"
     fi
 
     CPPFLAGS="${SAVED_CPPFLAGS}"
@@ -1053,12 +1047,12 @@ AC_DEFUN([STARPU_CHECK_COI_RUNTIME],
 
 if test x$enable_mic = xyes ; then
 
-    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, "coi_host")
+    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, coi_host)
 
     # Host runtime is not compatible, we are probably cross-compiling
     # Let's have a look for the device runtime which lib has a different name
     if test "$have_valid_coi" = "no" ; then
-	    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, "coi_device")
+	    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, coi_device)
     fi
 
     if test "$have_valid_coi" = "no" ; then
@@ -1478,6 +1472,15 @@ AC_MSG_CHECKING(Maximum number of workers)
 AC_MSG_RESULT($nmaxworkers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])
 
+# Computes the maximun number of combined worker
+nmaxcombinedworkers=`expr $maxcpus + $nmaxmicthreads`  
+AC_MSG_CHECKING(Maximum number of workers combinations)
+AC_MSG_RESULT($nmaxcombinedworkers)
+AC_DEFINE_UNQUOTED(STARPU_NMAX_COMBINEDWORKERS,
+	[$nmaxcombinedworkers], [Maximum number of worker combinations])
+
+
+
 # Computes the maximum number of implementations per arch
 AC_MSG_CHECKING(maximum number of implementations)
 AC_ARG_ENABLE(maximplementations, [AS_HELP_STRING([--enable-maximplementations=<number>],

+ 1 - 1
include/starpu_perfmodel.h

@@ -38,7 +38,7 @@ enum starpu_perfmodel_archtype
 	STARPU_CUDA_DEFAULT = STARPU_MAXCPUS,
 	STARPU_OPENCL_DEFAULT = STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS,
 	STARPU_MIC_DEFAULT = STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS,
-	STARPU_SCC_DEFAULT = STARPU_MIC_DEFAULT + STARPU_MAXMICDEVS
+	STARPU_SCC_DEFAULT = STARPU_MIC_DEFAULT + STARPU_MAXMICDEVS //* STARPU_MAXMICCPUS
 };
 
 #ifdef __STDC_VERSION__

+ 15 - 4
mic-configure

@@ -1,6 +1,6 @@
 #!/bin/bash
 
-ROOT_DIR=$PWD
+ROOT_DIR=$(dirname $0)
 
 cat > ./mic-config.log << EOF
 This file was created by StarPU mic-configure
@@ -28,12 +28,23 @@ do
 
 done
 
+x=$(type -t ${mic_host}-gcc)
+if [ -z "$x" ]
+then
+    echo "[error] please add path to ${mic_host}-gcc in your PATH"
+    exit 1
+fi
+
 for arch in mic host
 do
 	# We call the configure script from a build directory further in the
 	# arborescence
 
-	command="${ROOT_DIR}/configure"
+	case $ROOT_DIR in
+		/*) command="${ROOT_DIR}/configure";;
+		*) command="../${ROOT_DIR}/configure";;
+	esac
+
 	params="--enable-mic --with-coi-dir=$coi_dir --prefix=$prefix/$arch"
 
 	if test x$arch = xmic ; then
@@ -44,7 +55,7 @@ do
 	fi
 
 	# If the build directory doesn't exist yet, create it
-	if [ ! -d "${ROOT_DIR}/build_${arch}" ] ; then
+	if [ ! -d "build_${arch}" ] ; then
 		mkdir "build_${arch}"
 	fi
 
@@ -59,7 +70,7 @@ do
 	then
 		exit $?
 	fi
-	cd "${ROOT_DIR}"
+	cd ..
 done
 
 cat > Makefile << EOF

+ 2 - 58
mpi/examples/Makefile.am

@@ -44,16 +44,13 @@ BUILT_SOURCES =
 
 CLEANFILES = *.gcno *.gcda *.linkinfo
 
-EXTRA_DIST = 				\
+EXTRA_DIST = 					\
 	mpi_lu/mpi_lu-float.h		\
 	mpi_lu/mpi_lu-double.h		\
 	mpi_lu/plu_example.c		\
-	mpi_lu/plu_implicit_example.c	\
-	mpi_lu/plu_outofcore_example.c	\
 	mpi_lu/plu_solve.c		\
 	mpi_lu/pxlu.h			\
 	mpi_lu/pxlu.c			\
-	mpi_lu/pxlu_implicit.c		\
 	mpi_lu/pxlu_kernels.h		\
 	mpi_lu/pxlu_kernels.c		\
 	matrix_decomposition/mpi_cholesky_codelets.h 	\
@@ -104,11 +101,7 @@ if !NO_BLAS_LIB
 
 examplebin_PROGRAMS += 			\
 	mpi_lu/plu_example_float	\
-	mpi_lu/plu_example_double	\
-	mpi_lu/plu_implicit_example_float	\
-	mpi_lu/plu_implicit_example_double	\
-	mpi_lu/plu_outofcore_example_float	\
-	mpi_lu/plu_outofcore_example_double
+	mpi_lu/plu_example_double
 
 mpi_lu_plu_example_float_LDADD =	\
 	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
@@ -133,57 +126,8 @@ mpi_lu_plu_example_double_SOURCES =	\
 	mpi_lu/pdlu_kernels.c	    	\
 	mpi_lu/pdlu.c		    	\
 	$(top_srcdir)/examples/common/blas.c
-
-mpi_lu_plu_implicit_example_float_LDADD =	\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
-	$(STARPU_LIBNUMA_LDFLAGS)				\
-	$(STARPU_BLAS_LDFLAGS) -lm
-
-mpi_lu_plu_implicit_example_float_SOURCES =	\
-	mpi_lu/plu_implicit_example_float.c	\
-	mpi_lu/plu_solve_float.c		\
-	mpi_lu/pslu_kernels.c			\
-	mpi_lu/pslu_implicit.c			\
-	$(top_srcdir)/examples/common/blas.c
-
-mpi_lu_plu_implicit_example_double_LDADD =	\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
-	$(STARPU_LIBNUMA_LDFLAGS)				\
-	$(STARPU_BLAS_LDFLAGS) -lm
-
-mpi_lu_plu_implicit_example_double_SOURCES =	\
-	mpi_lu/plu_outofcore_example_double.c	\
-	mpi_lu/plu_solve_double.c		\
-	mpi_lu/pdlu_kernels.c			\
-	mpi_lu/pdlu_implicit.c			\
-	$(top_srcdir)/examples/common/blas.c
-
-mpi_lu_plu_outofcore_example_float_LDADD =	\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
-	$(STARPU_LIBNUMA_LDFLAGS)				\
-	$(STARPU_BLAS_LDFLAGS) -lm
-
-mpi_lu_plu_outofcore_example_float_SOURCES =	\
-	mpi_lu/plu_outofcore_example_float.c	\
-	mpi_lu/plu_solve_float.c		\
-	mpi_lu/pslu_kernels.c			\
-	mpi_lu/pslu_implicit.c			\
-	$(top_srcdir)/examples/common/blas.c
-
-mpi_lu_plu_outofcore_example_double_LDADD =	\
-	../src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la	\
-	$(STARPU_LIBNUMA_LDFLAGS)				\
-	$(STARPU_BLAS_LDFLAGS) -lm
-
-mpi_lu_plu_outofcore_example_double_SOURCES =	\
-	mpi_lu/plu_outofcore_example_double.c	\
-	mpi_lu/plu_solve_double.c		\
-	mpi_lu/pdlu_kernels.c			\
-	mpi_lu/pdlu_implicit.c			\
-	$(top_srcdir)/examples/common/blas.c
 endif
 
-
 ########################
 # MPI Cholesky example #
 ########################

+ 0 - 19
mpi/examples/mpi_lu/pdlu_implicit.c

@@ -1,19 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010, 2013  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "mpi_lu-double.h"
-#include "pxlu_implicit.c"

+ 0 - 6
mpi/examples/mpi_lu/plu_example.c

@@ -109,12 +109,6 @@ static void parse_args(int rank, int argc, char **argv)
 			char *argptr;
 			q = strtol(argv[++i], &argptr, 10);
 		}
-
-		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0) {
-			fprintf(stderr,"usage: %s [-size n] [-nblocks b] [-check] [-display] [-numa] [-p p] [-q q]\n", argv[0]);
-			fprintf(stderr,"\np * q must be equal to the number of MPI nodes\n");
-			exit(0);
-		}
 	}
 }
 

+ 0 - 357
mpi/examples/mpi_lu/plu_implicit_example.c

@@ -1,357 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <time.h>
-#include <math.h>
-#include <starpu.h>
-
-#include "pxlu.h"
-//#include "pxlu_kernels.h"
-
-#ifdef STARPU_HAVE_LIBNUMA
-#include <numaif.h>
-#endif
-
-static unsigned long size = 4096;
-static unsigned nblocks = 16;
-static unsigned check = 0;
-static int p = 1;
-static int q = 1;
-static unsigned display = 0;
-
-#ifdef STARPU_HAVE_LIBNUMA
-static unsigned numa = 0;
-#endif
-
-static size_t allocated_memory = 0;
-static size_t allocated_memory_extra = 0;
-
-static starpu_data_handle_t *dataA_handles;
-static TYPE **dataA;
-
-int get_block_rank(unsigned i, unsigned j);
-
-static void parse_args(int argc, char **argv)
-{
-	int i;
-	for (i = 1; i < argc; i++) {
-		if (strcmp(argv[i], "-size") == 0) {
-			char *argptr;
-			size = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-nblocks") == 0) {
-			char *argptr;
-			nblocks = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-check") == 0) {
-			check = 1;
-		}
-
-		if (strcmp(argv[i], "-display") == 0) {
-			display = 1;
-		}
-
-		if (strcmp(argv[i], "-numa") == 0) {
-#ifdef STARPU_HAVE_LIBNUMA
-			numa = 1;
-#else
-			if (rank == 0)
-				fprintf(stderr, "Warning: libnuma is not available\n");
-#endif
-		}
-
-		if (strcmp(argv[i], "-p") == 0) {
-			char *argptr;
-			p = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-q") == 0) {
-			char *argptr;
-			q = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0) {
-			fprintf(stderr,"usage: %s [-size n] [-nblocks b] [-check] [-display] [-numa] [-p p] [-q q]\n", argv[0]);
-			fprintf(stderr,"\np * q must be equal to the number of MPI nodes\n");
-			exit(0);
-		}
-	}
-}
-
-unsigned STARPU_PLU(display_flag)(void)
-{
-	return display;
-}
-
-static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnblocks)
-{
-	const unsigned block_size = (psize/pnblocks);
-
-	unsigned i, j;
-	for (i = 0; i < block_size; i++)
-	     for (j = 0; j < block_size; j++)
-	     {
-		  blockptr[j+i*block_size] = (TYPE)starpu_drand48();
-	     }
-}
-
-static void init_matrix(int rank)
-{
-#ifdef STARPU_HAVE_LIBNUMA
-	if (numa)
-	{
-		fprintf(stderr, "Using INTERLEAVE policy\n");
-		unsigned long nodemask = ((1<<0)|(1<<1));
-		int ret = set_mempolicy(MPOL_INTERLEAVE, &nodemask, 3);
-		if (ret)
-			perror("set_mempolicy failed");
-	}
-#endif
-
-	/* Allocate a grid of data handles, not all of them have to be allocated later on */
-	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
-	dataA = calloc(nblocks*nblocks, sizeof(TYPE *));
-	allocated_memory_extra += nblocks*nblocks*(sizeof(starpu_data_handle_t) + sizeof(TYPE *));
-
-	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
-
-	/* Allocate all the blocks that belong to this mpi node */
-	unsigned long i,j;
-	for (j = 0; j < nblocks; j++)
-	{
-		for (i = 0; i < nblocks; i++)
-		{
-			int block_rank = get_block_rank(i, j);
-			TYPE **blockptr = &dataA[j+i*nblocks];
-//			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
-			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
-
-			if (block_rank == rank)
-			{
-				/* This blocks should be treated by the current MPI process */
-				/* Allocate and fill it */
-				starpu_malloc((void **)blockptr, blocksize);
-				allocated_memory += blocksize;
-
-				//fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j);
-				fill_block_with_random(*blockptr, size, nblocks);
-				//fprintf(stderr, "Rank %d : fill block (i = %d, j = %d)\n", rank, i, j);
-				if (i == j)
-				{
-					unsigned tmp;
-					for (tmp = 0; tmp < size/nblocks; tmp++)
-					{
-						(*blockptr)[tmp*((size/nblocks)+1)] += (TYPE)10*nblocks;
-					}
-				}
-
-				/* Register it to StarPU */
-				starpu_matrix_data_register(handleptr, STARPU_MAIN_RAM,
-					(uintptr_t)*blockptr, size/nblocks,
-					size/nblocks, size/nblocks, sizeof(TYPE));
-			}
-			else {
-				starpu_matrix_data_register(handleptr, -1,
-					0, size/nblocks,
-					size/nblocks, size/nblocks, sizeof(TYPE));
-				*blockptr = STARPU_POISON_PTR;
-			}
-			starpu_data_set_rank(*handleptr, block_rank);
-			starpu_data_set_tag(*handleptr, j+i*nblocks);
-		}
-	}
-
-	//display_all_blocks(nblocks, size/nblocks);
-}
-
-TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j)
-{
-	return dataA[j+i*nblocks];
-}
-
-int get_block_rank(unsigned i, unsigned j)
-{
-	/* Take a 2D block cyclic distribution */
-	/* NB: p (resp. q) is for "direction" i (resp. j) */
-	return (j % q) * p + (i % p);
-}
-
-starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
-{
-	return dataA_handles[j+i*nblocks];
-}
-
-static void display_grid(int rank, unsigned pnblocks)
-{
-	if (!display)
-		return;
-
-	//if (rank == 0)
-	{
-		fprintf(stderr, "2D grid layout (Rank %d): \n", rank);
-
-		unsigned i, j;
-		for (j = 0; j < pnblocks; j++)
-		{
-			for (i = 0; i < pnblocks; i++)
-			{
-				TYPE *blockptr = STARPU_PLU(get_block)(i, j);
-				starpu_data_handle_t handle = STARPU_PLU(get_block_handle)(i, j);
-
-				fprintf(stderr, "%d (data %p handle %p)", get_block_rank(i, j), blockptr, handle);
-			}
-			fprintf(stderr, "\n");
-		}
-	}
-}
-
-int main(int argc, char **argv)
-{
-	int rank;
-	int world_size;
-
-	starpu_srand48((long int)time(NULL));
-
-	parse_args(argc, argv);
-
-	int ret = starpu_init(NULL);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-
-	ret = starpu_mpi_init(&argc, &argv, 1);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
-
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
-
-	STARPU_ASSERT(p*q == world_size);
-
-	starpu_cublas_init();
-
-	/*
-	 * 	Problem Init
-	 */
-
-	init_matrix(rank);
-
-	fprintf(stderr, "Rank %d: allocated (%d + %d) MB = %d MB\n", rank,
-                        (int)(allocated_memory/(1024*1024)),
-			(int)(allocated_memory_extra/(1024*1024)),
-                        (int)((allocated_memory+allocated_memory_extra)/(1024*1024)));
-
-	display_grid(rank, nblocks);
-
-	TYPE *a_r = NULL;
-//	STARPU_PLU(display_data_content)(a_r, size);
-
-	TYPE *x, *y;
-
-	if (check)
-	{
-		x = calloc(size, sizeof(TYPE));
-		STARPU_ASSERT(x);
-
-		y = calloc(size, sizeof(TYPE));
-		STARPU_ASSERT(y);
-
-		if (rank == 0)
-		{
-			unsigned ind;
-			for (ind = 0; ind < size; ind++)
-				x[ind] = (TYPE)starpu_drand48();
-		}
-
-		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
-
-		if (rank == 0)
-			STARPU_PLU(display_data_content)(a_r, size);
-
-//		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
-	}
-
-	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);
-
-	/*
-	 * 	Report performance
-	 */
-
-	if (rank == 0)
-	{
-		fprintf(stderr, "Computation took: %f ms\n", timing/1000);
-
-		unsigned n = size;
-		double flop = (2.0f*n*n*n)/3.0f;
-		fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
-	}
-
-	/*
-	 *	Test Result Correctness
-	 */
-
-	if (check)
-	{
-		/*
-		 *	Compute || A - LU ||
-		 */
-
-		STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r);
-
-#if 0
-		/*
-		 *	Compute || Ax - LUx ||
-		 */
-
-		unsigned ind;
-
-		y2 = calloc(size, sizeof(TYPE));
-		STARPU_ASSERT(y);
-
-		if (rank == 0)
-		{
-			for (ind = 0; ind < size; ind++)
-			{
-				y2[ind] = (TYPE)0.0;
-			}
-		}
-
-		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);
-
-		/* Compute y2 = y2 - y */
-		CPU_AXPY(size, -1.0, y, 1, y2, 1);
-
-		TYPE err = CPU_ASUM(size, y2, 1);
-		int max = CPU_IAMAX(size, y2, 1);
-
-		fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
-		fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
-#endif
-	}
-
-	/*
-	 * 	Termination
-	 */
-
-	starpu_cublas_shutdown();
-	starpu_mpi_shutdown();
-	starpu_shutdown();
-
-	return 0;
-}

+ 0 - 19
mpi/examples/mpi_lu/plu_implicit_example_double.c

@@ -1,19 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010, 2013  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "mpi_lu-double.h"
-#include "plu_implicit_example.c"

+ 0 - 19
mpi/examples/mpi_lu/plu_implicit_example_float.c

@@ -1,19 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010, 2013  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "mpi_lu-float.h"
-#include "plu_implicit_example.c"

+ 0 - 381
mpi/examples/mpi_lu/plu_outofcore_example.c

@@ -1,381 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <time.h>
-#include <math.h>
-#include <starpu.h>
-#include <fcntl.h>
-#include <sys/stat.h>
-
-#include "pxlu.h"
-//#include "pxlu_kernels.h"
-
-#ifdef STARPU_HAVE_LIBNUMA
-#include <numaif.h>
-#endif
-
-static unsigned long size = 4096;
-static unsigned nblocks = 16;
-static unsigned check = 0;
-static int p = 1;
-static int q = 1;
-static unsigned display = 0;
-static char *path = "/tmp/starpu-mpi_LU";
-
-#ifdef STARPU_HAVE_LIBNUMA
-static unsigned numa = 0;
-#endif
-
-static size_t allocated_memory = 0;
-
-static starpu_data_handle_t *dataA_handles;
-
-int get_block_rank(unsigned i, unsigned j);
-
-static void parse_args(int argc, char **argv)
-{
-	int i;
-	for (i = 1; i < argc; i++) {
-		if (strcmp(argv[i], "-size") == 0) {
-			char *argptr;
-			size = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-nblocks") == 0) {
-			char *argptr;
-			nblocks = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-check") == 0) {
-			check = 1;
-		}
-
-		if (strcmp(argv[i], "-display") == 0) {
-			display = 1;
-		}
-
-		if (strcmp(argv[i], "-numa") == 0) {
-#ifdef STARPU_HAVE_LIBNUMA
-			numa = 1;
-#else
-			if (rank == 0)
-				fprintf(stderr, "Warning: libnuma is not available\n");
-#endif
-		}
-
-		if (strcmp(argv[i], "-p") == 0) {
-			char *argptr;
-			p = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-q") == 0) {
-			char *argptr;
-			q = strtol(argv[++i], &argptr, 10);
-		}
-
-		if (strcmp(argv[i], "-path") == 0)
-			path = argv[++i];
-
-		if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "-help") == 0 || strcmp(argv[i], "--help") == 0) {
-			fprintf(stderr,"usage: %s [-size n] [-nblocks b] [-check] [-display] [-numa] [-p p] [-q q] [-path PATH]\n", argv[0]);
-			fprintf(stderr,"\np * q must be equal to the number of MPI nodes\n");
-			exit(0);
-		}
-	}
-}
-
-unsigned STARPU_PLU(display_flag)(void)
-{
-	return display;
-}
-
-static void fill_block_with_random(TYPE *blockptr, unsigned psize, unsigned pnblocks)
-{
-	const unsigned block_size = (psize/pnblocks);
-
-	unsigned i, j;
-	for (i = 0; i < block_size; i++)
-	     for (j = 0; j < block_size; j++)
-	     {
-		  blockptr[j+i*block_size] = (TYPE)starpu_drand48();
-	     }
-}
-
-static void create_matrix()
-{
-	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
-	TYPE *blockptr = malloc(blocksize);
-	int fd;
-	char *filename;
-	unsigned filename_length = strlen(path) + 1 + sizeof(nblocks)*3 + 1 + sizeof(nblocks)*3 + 1;
-
-	filename = malloc(filename_length);
-
-	allocated_memory += nblocks*nblocks*blocksize*sizeof(TYPE *);
-
-	/* Create the whole matrix on the disk */
-	unsigned i,j;
-	for (j = 0; j < nblocks; j++)
-	{
-		for (i = 0; i < nblocks; i++)
-		{
-			fill_block_with_random(blockptr, size, nblocks);
-			if (i == j)
-			{
-				unsigned tmp;
-				for (tmp = 0; tmp < size/nblocks; tmp++)
-				{
-					blockptr[tmp*((size/nblocks)+1)] += (TYPE)10*nblocks;
-				}
-			}
-			snprintf(filename, filename_length, "%s/%u,%u", path, i, j);
-			fd = open(filename, O_WRONLY|O_CREAT|O_TRUNC, 0777);
-			if (fd < 0) {
-				perror("open");
-				exit(1);
-			}
-			if (write(fd, blockptr, blocksize) != (ssize_t) blocksize) {
-				fprintf(stderr,"short write");
-				exit(1);
-			}
-			if (close(fd) < 0) {
-				perror("close");
-				exit(1);
-			}
-		}
-	}
-
-	free(blockptr);
-	free(filename);
-}
-
-static void init_matrix(int rank)
-{
-	/* Allocate a grid of data handles, not all of them have to be allocated later on */
-	dataA_handles = calloc(nblocks*nblocks, sizeof(starpu_data_handle_t));
-
-	size_t blocksize = (size_t)(size/nblocks)*(size/nblocks)*sizeof(TYPE);
-
-	int disk_node = starpu_disk_register(&starpu_disk_unistd_ops, path, STARPU_MAX(1024*1024, size*size*sizeof(TYPE)));
-
-	char filename[sizeof(nblocks)*3 + 1 + sizeof(nblocks)*3 + 1];
-
-	/* Allocate all the blocks that belong to this mpi node */
-	unsigned i,j;
-	for (j = 0; j < nblocks; j++)
-	{
-		for (i = 0; i < nblocks; i++)
-		{
-			int block_rank = get_block_rank(i, j);
-//			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
-			starpu_data_handle_t *handleptr = &dataA_handles[j+nblocks*i];
-
-			if (block_rank == rank)
-			{
-				void *disk_obj;
-				snprintf(filename, sizeof(filename), "%u,%u", i, j);
-				/* Register it to StarPU */
-				disk_obj = starpu_disk_open(disk_node, filename, blocksize);
-				if (!disk_obj) {
-					fprintf(stderr,"could not open %s\n", filename);
-					exit(1);
-				}
-				starpu_matrix_data_register(handleptr, disk_node,
-					(uintptr_t) disk_obj, size/nblocks,
-					size/nblocks, size/nblocks, sizeof(TYPE));
-			}
-			else {
-				starpu_matrix_data_register(handleptr, -1,
-					0, size/nblocks,
-					size/nblocks, size/nblocks, sizeof(TYPE));
-			}
-			starpu_data_set_rank(*handleptr, block_rank);
-			starpu_data_set_tag(*handleptr, j+i*nblocks);
-		}
-	}
-
-	//display_all_blocks(nblocks, size/nblocks);
-}
-
-TYPE *STARPU_PLU(get_block)(unsigned i, unsigned j)
-{
-	/* This does not really make sense in out of core */
-	assert(0);
-}
-
-int get_block_rank(unsigned i, unsigned j)
-{
-	/* Take a 2D block cyclic distribution */
-	/* NB: p (resp. q) is for "direction" i (resp. j) */
-	return (j % q) * p + (i % p);
-}
-
-starpu_data_handle_t STARPU_PLU(get_block_handle)(unsigned i, unsigned j)
-{
-	return dataA_handles[j+i*nblocks];
-}
-
-int main(int argc, char **argv)
-{
-	int rank;
-	int world_size;
-	int ret;
-	unsigned i, j;
-
-	starpu_srand48((long int)time(NULL));
-
-	parse_args(argc, argv);
-
-	ret = mkdir(path, 0777);
-	if (ret != 0 && errno != EEXIST) {
-		fprintf(stderr,"%s does not exist\n", path);
-		exit(1);
-	}
-
-	ret = starpu_init(NULL);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-
-	ret = starpu_mpi_init(&argc, &argv, 1);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
-
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	MPI_Comm_size(MPI_COMM_WORLD, &world_size);
-
-	STARPU_ASSERT(p*q == world_size);
-
-	starpu_cublas_init();
-
-	/*
-	 * 	Problem Init
-	 */
-
-	if (rank == 0)
-		create_matrix();
-
-	starpu_mpi_barrier(MPI_COMM_WORLD);
-
-	init_matrix(rank);
-
-	if (rank == 0)
-		fprintf(stderr, "%dMB on disk\n", (int)(allocated_memory/(1024*1024)));
-
-	TYPE *a_r = NULL;
-//	STARPU_PLU(display_data_content)(a_r, size);
-
-	TYPE *x, *y;
-
-	if (check)
-	{
-		x = calloc(size, sizeof(TYPE));
-		STARPU_ASSERT(x);
-
-		y = calloc(size, sizeof(TYPE));
-		STARPU_ASSERT(y);
-
-		if (rank == 0)
-		{
-			unsigned ind;
-			for (ind = 0; ind < size; ind++)
-				x[ind] = (TYPE)starpu_drand48();
-		}
-
-		a_r = STARPU_PLU(reconstruct_matrix)(size, nblocks);
-
-		if (rank == 0)
-			STARPU_PLU(display_data_content)(a_r, size);
-
-//		STARPU_PLU(compute_ax)(size, x, y, nblocks, rank);
-	}
-
-	double timing = STARPU_PLU(plu_main)(nblocks, rank, world_size);
-
-	/*
-	 * 	Report performance
-	 */
-
-	if (rank == 0)
-	{
-		fprintf(stderr, "Computation took: %f ms\n", timing/1000);
-
-		unsigned n = size;
-		double flop = (2.0f*n*n*n)/3.0f;
-		fprintf(stderr, "Synthetic GFlops : %2.2f\n", (flop/timing/1000.0f));
-	}
-
-	/*
-	 *	Test Result Correctness
-	 */
-
-	if (check)
-	{
-		/*
-		 *	Compute || A - LU ||
-		 */
-
-		STARPU_PLU(compute_lu_matrix)(size, nblocks, a_r);
-
-#if 0
-		/*
-		 *	Compute || Ax - LUx ||
-		 */
-
-		unsigned ind;
-
-		y2 = calloc(size, sizeof(TYPE));
-		STARPU_ASSERT(y);
-
-		if (rank == 0)
-		{
-			for (ind = 0; ind < size; ind++)
-			{
-				y2[ind] = (TYPE)0.0;
-			}
-		}
-
-		STARPU_PLU(compute_lux)(size, x, y2, nblocks, rank);
-
-		/* Compute y2 = y2 - y */
-		CPU_AXPY(size, -1.0, y, 1, y2, 1);
-
-		TYPE err = CPU_ASUM(size, y2, 1);
-		int max = CPU_IAMAX(size, y2, 1);
-
-		fprintf(stderr, "(A - LU)X Avg error : %e\n", err/(size*size));
-		fprintf(stderr, "(A - LU)X Max error : %e\n", y2[max]);
-#endif
-	}
-
-	/*
-	 * 	Termination
-	 */
-	for (j = 0; j < nblocks; j++)
-	{
-		for (i = 0; i < nblocks; i++)
-		{
-			starpu_data_unregister(dataA_handles[j+nblocks*i]);
-		}
-	}
-
-	starpu_cublas_shutdown();
-	starpu_mpi_shutdown();
-	starpu_shutdown();
-
-	return 0;
-}

+ 0 - 19
mpi/examples/mpi_lu/plu_outofcore_example_double.c

@@ -1,19 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010, 2013  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "mpi_lu-double.h"
-#include "plu_outofcore_example.c"

+ 0 - 19
mpi/examples/mpi_lu/plu_outofcore_example_float.c

@@ -1,19 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010, 2013  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "mpi_lu-float.h"
-#include "plu_outofcore_example.c"

+ 0 - 19
mpi/examples/mpi_lu/pslu_implicit.c

@@ -1,19 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010, 2013  Université de Bordeaux 1
- * Copyright (C) 2010  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "mpi_lu-float.h"
-#include "pxlu_implicit.c"

+ 0 - 162
mpi/examples/mpi_lu/pxlu_implicit.c

@@ -1,162 +0,0 @@
-/* StarPU --- Runtime system for heterogeneous multicore architectures.
- *
- * Copyright (C) 2010, 2011, 2013  Université de Bordeaux 1
- * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
- *
- * StarPU is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published by
- * the Free Software Foundation; either version 2.1 of the License, or (at
- * your option) any later version.
- *
- * StarPU is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * See the GNU Lesser General Public License in COPYING.LGPL for more details.
- */
-
-#include "pxlu.h"
-#include "pxlu_kernels.h"
-#include <sys/time.h>
-
-//#define VERBOSE_INIT	1
-
-//#define DEBUG	1
-
-static unsigned no_prio = 0;
-
-static unsigned nblocks = 0;
-static int rank = -1;
-static int world_size = -1;
-
-struct callback_arg {
-	unsigned i, j, k;
-};
-
-/*
- *	Task 11 (diagonal factorization)
- */
-
-static void create_task_11(unsigned k)
-{
-	starpu_mpi_insert_task(MPI_COMM_WORLD,
-			&STARPU_PLU(cl11),
-			STARPU_VALUE, &k, sizeof(k),
-			STARPU_VALUE, &k, sizeof(k),
-			STARPU_VALUE, &k, sizeof(k),
-			STARPU_RW, STARPU_PLU(get_block_handle)(k, k),
-			STARPU_PRIORITY, !no_prio ?
-				STARPU_MAX_PRIO : STARPU_MIN_PRIO,
-			0);
-}
-
-/*
- *	Task 12 (Update lower left (TRSM))
- */
-
-static void create_task_12(unsigned k, unsigned j)
-{
-#warning temporary fix 
-	starpu_mpi_insert_task(MPI_COMM_WORLD,
-			//&STARPU_PLU(cl12),
-			&STARPU_PLU(cl21),
-			STARPU_VALUE, &j, sizeof(j),
-			STARPU_VALUE, &j, sizeof(j),
-			STARPU_VALUE, &k, sizeof(k),
-			STARPU_R, STARPU_PLU(get_block_handle)(k, k),
-			STARPU_RW, STARPU_PLU(get_block_handle)(k, j),
-			STARPU_PRIORITY, !no_prio && (j == k+1) ?
-				STARPU_MAX_PRIO : STARPU_MIN_PRIO,
-			0);
-}
-
-/*
- *	Task 21 (Update upper right (TRSM))
- */
-
-static void create_task_21(unsigned k, unsigned i)
-{
-#warning temporary fix 
-	starpu_mpi_insert_task(MPI_COMM_WORLD,
-			//&STARPU_PLU(cl21),
-			&STARPU_PLU(cl12),
-			STARPU_VALUE, &i, sizeof(i),
-			STARPU_VALUE, &i, sizeof(i),
-			STARPU_VALUE, &k, sizeof(k),
-			STARPU_R, STARPU_PLU(get_block_handle)(k, k),
-			STARPU_RW, STARPU_PLU(get_block_handle)(i, k),
-			STARPU_PRIORITY, !no_prio && (i == k+1) ?
-				STARPU_MAX_PRIO : STARPU_MIN_PRIO,
-			0);
-}
-
-/*
- *	Task 22 (GEMM)
- */
-
-static void create_task_22(unsigned k, unsigned i, unsigned j)
-{
-	starpu_mpi_insert_task(MPI_COMM_WORLD,
-			&STARPU_PLU(cl22),
-			STARPU_VALUE, &i, sizeof(i),
-			STARPU_VALUE, &j, sizeof(j),
-			STARPU_VALUE, &k, sizeof(k),
-			STARPU_R, STARPU_PLU(get_block_handle)(k, j),
-			STARPU_R, STARPU_PLU(get_block_handle)(i, k),
-			STARPU_RW, STARPU_PLU(get_block_handle)(i, j),
-			STARPU_PRIORITY, !no_prio && (i == k + 1) && (j == k +1) ?
-				STARPU_MAX_PRIO : STARPU_MIN_PRIO,
-			0);
-}
-
-/*
- *	code to bootstrap the factorization 
- */
-
-double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size)
-{
-	struct timeval start;
-	struct timeval end;
-
-	nblocks = _nblocks;
-	rank = _rank;
-	world_size = _world_size;
-
-	/* create all the DAG nodes */
-	unsigned i,j,k;
-
-	starpu_mpi_barrier(MPI_COMM_WORLD);
-
-	gettimeofday(&start, NULL);
-
-	for (k = 0; k < nblocks; k++)
-	{
-		create_task_11(k);
-
-		for (i = k+1; i<nblocks; i++)
-		{
-			create_task_12(k, i);
-			create_task_21(k, i);
-		}
-
-		for (i = k+1; i<nblocks; i++)
-		{
-			for (j = k+1; j<nblocks; j++)
-			{
-				create_task_22(k, i, j);
-			}
-		}
-	}
-
-	starpu_task_wait_for_all();
-
-	starpu_mpi_barrier(MPI_COMM_WORLD);
-
-	gettimeofday(&end, NULL);
-
-	double timing = (double)((end.tv_sec - start.tv_sec)*1000000 + (end.tv_usec - start.tv_usec));
-	
-//	fprintf(stderr, "RANK %d -> took %f ms\n", rank, timing/1000);
-	
-	return timing;
-}

+ 1 - 0
mpi/src/starpu_mpi_collective.c

@@ -57,6 +57,7 @@ int starpu_mpi_scatter_detached(starpu_data_handle_t *data_handles, int count, i
 		callback_arg->nb = 0;
 		callback_arg->callback = (rank == root) ? scallback : rcallback;
 		callback_arg->arg = (rank == root) ? sarg : rarg;
+		if (callback_arg->callback == NULL)
 
 		for(x = 0; x < count ; x++)
 		{

+ 0 - 3
mpi/tests/Makefile.am

@@ -198,8 +198,5 @@ mpi_reduction_SOURCES += mpi_reduction_kernels.c
 user_defined_datatype_SOURCES = user_defined_datatype.c
 user_defined_datatype_SOURCES += $(top_srcdir)/examples/interface/complex_interface.c
 
-mpi_earlyrecv2_SOURCES = mpi_earlyrecv2.c
-mpi_earlyrecv2_SOURCES += $(top_srcdir)/examples/interface/complex_interface.c
-
 showcheck:
 	-cat $(TEST_LOGS) /dev/null

+ 41 - 159
mpi/tests/mpi_earlyrecv2.c

@@ -18,199 +18,81 @@
 #include <starpu_mpi.h>
 #include "helper.h"
 #include <unistd.h>
-#include <interface/complex_interface.h>
 
+//#define NB 1000
 #define NB 10
 
-static starpu_pthread_mutex_t mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
-static starpu_pthread_cond_t cond = STARPU_PTHREAD_COND_INITIALIZER;
-
-void callback(void *arg)
+int main(int argc, char **argv)
 {
-	unsigned *received = arg;
+	int ret, rank, size, i;
+	starpu_data_handle_t tab_handle[NB];
+	int value[NB];
 
-	STARPU_PTHREAD_MUTEX_LOCK(&mutex);
-	*received = *received + 1;
-	FPRINTF_MPI("Requests %d received\n", *received);
-	STARPU_PTHREAD_COND_SIGNAL(&cond);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
-}
+	MPI_Init(NULL, NULL);
+	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+	MPI_Comm_size(MPI_COMM_WORLD, &size);
 
-typedef void (*check_func)(starpu_data_handle_t handle, int i, int rank, int *error);
+	if (size%2 != 0)
+	{
+		if (rank == 0)
+			FPRINTF(stderr, "We need a even number of processes.\n");
+
+		MPI_Finalize();
+		return STARPU_TEST_SKIPPED;
+	}
+
+	ret = starpu_init(NULL);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+	ret = starpu_mpi_init(NULL, NULL, 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
+
+	for(i=0 ; i<NB ; i++)
+	{
+		value[i]=i*rank;
+		starpu_variable_data_register(&tab_handle[i], STARPU_MAIN_RAM, (uintptr_t)&value[i], sizeof(int));
+		starpu_data_set_tag(tab_handle[i], i);
+	}
 
-int exchange(int rank, starpu_data_handle_t *handles, check_func func, int detached)
-{
 	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
-	int i;
 
 	if (rank%2)
 	{
-		starpu_mpi_send(handles[0], other_rank, 0, MPI_COMM_WORLD);
-		starpu_mpi_send(handles[NB-1], other_rank, NB-1, MPI_COMM_WORLD);
+		starpu_mpi_send(tab_handle[0], other_rank, 0, MPI_COMM_WORLD);
+		starpu_mpi_send(tab_handle[NB-1], other_rank, NB-1, MPI_COMM_WORLD);
 		for(i=1 ; i<NB-1 ; i++)
 		{
-			starpu_mpi_send(handles[i], other_rank, i, MPI_COMM_WORLD);
+			starpu_mpi_send(tab_handle[i], other_rank, i, MPI_COMM_WORLD);
 		}
-		return 0;
 	}
 	else
 	{
-		int ret=0;
 		starpu_mpi_req req[NB];
-		int received = 0;
-
-		if (detached)
-		{
-			starpu_mpi_irecv_detached(handles[0], other_rank, 0, MPI_COMM_WORLD, callback, &received);
-		}
-		else
-		{
-			memset(req, 0, NB*sizeof(starpu_mpi_req));
-			starpu_mpi_irecv(handles[0], &req[0], other_rank, 0, MPI_COMM_WORLD);
-			STARPU_ASSERT(req[0] != NULL);
-		}
+		memset(req, 0, NB*sizeof(starpu_mpi_req));
 
+		starpu_mpi_irecv(tab_handle[0], &req[0], other_rank, 0, MPI_COMM_WORLD);
+		STARPU_ASSERT(req[0] != NULL);
 		// We sleep to make sure that the data for the tag 9 will be received before the recv is posted
 		usleep(2000000);
 		for(i=1 ; i<NB ; i++)
 		{
-			if (detached)
-			{
-				starpu_mpi_irecv_detached(handles[i], other_rank, i, MPI_COMM_WORLD, callback, &received);
-			}
-			else
-			{
-				starpu_mpi_irecv(handles[i], &req[i], other_rank, i, MPI_COMM_WORLD);
-				STARPU_ASSERT(req[i] != NULL);
-			}
-		}
-
-		if (detached)
-		{
-			STARPU_PTHREAD_MUTEX_LOCK(&mutex);
-			while (received != NB)
-			{
-			     FPRINTF_MPI("Received %d messages\n", received);
-			     STARPU_PTHREAD_COND_WAIT(&cond, &mutex);
-			}
-			STARPU_PTHREAD_MUTEX_UNLOCK(&mutex);
+			starpu_mpi_irecv(tab_handle[i], &req[i], other_rank, i, MPI_COMM_WORLD);
+			STARPU_ASSERT(req[i] != NULL);
 		}
-		else
+		for(i=0 ; i<NB ; i++)
 		{
-			for(i=0 ; i<NB ; i++)
-			{
-			     starpu_mpi_wait(&req[i], NULL);
-			     func(handles[i], i, rank, &ret);
-			}
+			starpu_mpi_wait(&req[i], NULL);
+			int *rvalue = (int *)starpu_data_get_local_ptr(tab_handle[i]);
+			STARPU_ASSERT_MSG(*rvalue==i*other_rank, "Incorrect received value: %d != %d\n", *rvalue, i*other_rank);
 		}
-		return ret;
 	}
-}
-
-void check_variable(starpu_data_handle_t handle, int i, int rank, int *error)
-{
-	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
-
-	int *rvalue = (int *)starpu_data_get_local_ptr(handle);
-	if (*rvalue != i*other_rank)
-	{
-		FPRINTF_MPI("Incorrect received value: %d != %d\n", *rvalue, i*other_rank);
-		*error = 1;
-	}
-}
-
-int exchange_variable(int rank, int detached)
-{
-	int ret, i;
-	starpu_data_handle_t tab_handle[NB];
-	int value[NB];
 
-	FPRINTF_MPI("Exchanging variable data with detached=%d\n", detached);
-
-	for(i=0 ; i<NB ; i++)
-	{
-		value[i]=i*rank;
-		starpu_variable_data_register(&tab_handle[i], STARPU_MAIN_RAM, (uintptr_t)&value[i], sizeof(int));
-		starpu_data_set_tag(tab_handle[i], i);
-	}
-	ret = exchange(rank, tab_handle, check_variable, detached);
 	for(i=0 ; i<NB ; i++)
 		starpu_data_unregister(tab_handle[i]);
 
-	return ret;
-}
-
-void check_complex(starpu_data_handle_t handle, int i, int rank, int *error)
-{
-	double *real = starpu_complex_get_real(handle);
-	double *imaginary = starpu_complex_get_imaginary(handle);
-
-	int other_rank = rank%2 == 0 ? rank+1 : rank-1;
-
-	if ((*real != ((i*other_rank)+12)) || (*imaginary != ((i*other_rank)+45)))
-	{
-		FPRINTF_MPI("Incorrect received value: %f != %d || %f != %d\n", *real, ((i*other_rank)+12), *imaginary, ((i*other_rank)+45));
-		*error = 1;
-	}
-}
-
-int exchange_complex(int rank, int detached)
-{
-	int ret, i;
-	starpu_data_handle_t handle[NB];
-	double real[NB];
-	double imaginary[NB];
-
-	FPRINTF_MPI("Exchanging complex data with detached=%d\n", detached);
-
-	for(i=0 ; i<NB ; i++)
-	{
-		real[i] = (i*rank)+12;
-		imaginary[i] = (i*rank)+45;
-		starpu_complex_data_register(&handle[i], STARPU_MAIN_RAM, &real[i], &imaginary[i], 1);
-		starpu_data_set_tag(handle[i], i);
-	}
-	ret = exchange(rank, handle, check_complex, detached);
-	for(i=0 ; i<NB ; i++)
-		starpu_data_unregister(handle[i]);
-
-	return ret;
-}
-
-int main(int argc, char **argv)
-{
-	int ret, rank, size;
-
-	MPI_Init(NULL, NULL);
-	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
-	MPI_Comm_size(MPI_COMM_WORLD, &size);
-
-	if (size%2 != 0)
-	{
-		if (rank == 0)
-			FPRINTF(stderr, "We need a even number of processes.\n");
-
-		MPI_Finalize();
-		return STARPU_TEST_SKIPPED;
-	}
-
-	ret = starpu_init(NULL);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
-	ret = starpu_mpi_init(NULL, NULL, 0);
-	STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
-
-	ret = exchange_variable(rank, 0);
-	if (ret == 0)
-		ret = exchange_variable(rank, 1);
-	if (ret == 0)
-		ret = exchange_complex(rank, 0);
-	if (ret == 0)
-		ret = exchange_complex(rank, 1);
-
 	starpu_mpi_shutdown();
 	starpu_shutdown();
 
 	MPI_Finalize();
 
-	return ret;
+	return 0;
 }

+ 4 - 0
src/common/barrier_counter.h

@@ -14,6 +14,9 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
+#ifndef __BARRIER_COUNTER_H__
+#define __BARRIER_COUNTER_H__
+
 #include <common/utils.h>
 #include <common/barrier.h>
 
@@ -37,3 +40,4 @@ int _starpu_barrier_counter_increment_until_full_counter(struct _starpu_barrier_
 
 int _starpu_barrier_counter_increment(struct _starpu_barrier_counter *barrier_c);
 
+#endif

+ 21 - 3
src/core/combined_workers.c

@@ -67,9 +67,14 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 	{
 		int id = workerid_array[i];
 
+#ifdef STARPU_USE_MIC
+		STARPU_ASSERT(config->workers[id].arch == STARPU_CPU_WORKER || config->workers[id].arch == STARPU_MIC_WORKER);
+		STARPU_ASSERT(config->workers[id].worker_mask == STARPU_CPU || config->workers[id].worker_mask == STARPU_MIC);
+#else/* STARPU_USE_MIC */
 		/* We only combine CPUs */
-		STARPU_ASSERT(config->workers[id].perf_arch == STARPU_CPU_DEFAULT);
+		STARPU_ASSERT(config->workers[id].arch == STARPU_CPU_WORKER);
 		STARPU_ASSERT(config->workers[id].worker_mask == STARPU_CPU);
+#endif /* STARPU_USE_MIC */
 
 		/* We only combine valid "basic" workers */
 		if ((id < 0) || (id >= basic_worker_count))
@@ -95,8 +100,21 @@ int starpu_combined_worker_assign_workerid(int nworkers, int workerid_array[])
 		&config->combined_workers[combined_worker_id];
 
 	combined_worker->worker_size = nworkers;
-	combined_worker->perf_arch = (enum starpu_perfmodel_archtype) (STARPU_CPU_DEFAULT + nworkers - 1);
-	combined_worker->worker_mask = STARPU_CPU;
+
+#ifdef STARPU_USE_MIC
+	if(config->workers[workerid_array[0]].worker_mask == STARPU_MIC)
+	{
+		combined_worker->perf_arch = (enum starpu_perfmodel_archtype) (STARPU_MIC_DEFAULT + config->workers[workerid_array[0]].mp_nodeid /* *STARPU_MAXMICCPUS + nworkers - 1*/);
+		combined_worker->worker_mask = STARPU_MIC;
+	}
+#endif
+	if(config->workers[workerid_array[0]].worker_mask == STARPU_CPU)
+	{
+		combined_worker->perf_arch = (enum starpu_perfmodel_archtype) (STARPU_CPU_DEFAULT + nworkers - 1);
+		combined_worker->worker_mask = STARPU_CPU;
+	}
+	combined_worker->count = nworkers -1;
+	pthread_mutex_init(&combined_worker->count_mutex,NULL);
 
 	/* We assume that the memory node should either be that of the first
 	 * entry, and it is very likely that every worker in the combination

+ 86 - 25
src/core/detect_combined_workers.c

@@ -191,58 +191,119 @@ static void find_and_assign_combinations_with_hwloc(int *workerids, int nworkers
 
 #else /* STARPU_HAVE_HWLOC */
 
+static void assign_combinations_without_hwloc(struct starpu_worker_collection* worker_collection, int* workers, unsigned n, int min, int max)
+{
+
+	int size,i,count =0;
+	//if the maximun number of worker is already reached
+	if(worker_collection->nworkers >= STARPU_NMAXWORKERS - 1)
+		return;
+
+	for (size = min; size <= max; size *= 2)
+	{
+		unsigned first;
+		for (first = 0; first < n; first += size)
+		{
+			if (first + size <= n)
+			{
+				int found_workerids[size];
+
+				for (i = 0; i < size; i++)
+					found_workerids[i] = workers[first + i];
+
+				/* We register this combination */
+				int newworkerid;
+				newworkerid = starpu_combined_worker_assign_workerid(size, found_workerids);
+				STARPU_ASSERT(newworkerid >= 0);
+				count++;
+				worker_collection->add(worker_collection, newworkerid);
+				//if the maximun number of worker is reached, then return
+				if(worker_collection->nworkers >= STARPU_NMAXWORKERS - 1)
+					return;
+			}
+		}
+	}
+}
+
+
 static void find_and_assign_combinations_without_hwloc(int *workerids, int nworkers)
 {
+	int i;
+	unsigned j;
 	unsigned sched_ctx_id  = starpu_sched_ctx_get_context();
 	if(sched_ctx_id == STARPU_NMAX_SCHED_CTXS)
 		sched_ctx_id = 0;
-	int min;
-	int max;
+	int min, max, mic_min, mic_max;
 
 	struct starpu_worker_collection* workers = starpu_sched_ctx_get_worker_collection(sched_ctx_id);
 
 	/* We put the id of all CPU workers in this array */
 	int cpu_workers[STARPU_NMAXWORKERS];
 	unsigned ncpus = 0;
+#ifdef STARPU_USE_MIC
+	unsigned nb_mics = _starpu_get_machine_config()->topology.nmicdevices;
+	unsigned * nmics_table;
+	int * mic_id;
+	int ** mic_workers;
+	mic_id = malloc(sizeof(int)*nb_mics);
+	nmics_table = malloc(sizeof(unsigned)*nb_mics);
+	mic_workers = malloc(sizeof(int*)*nb_mics);
+	for(i=0; i<nb_mics; i++)
+	{
+		mic_id[i] = -1;
+		nmics_table[i] = 0;
+		mic_workers[i] = malloc(sizeof(int)*STARPU_NMAXWORKERS);
+	}
+#endif /* STARPU_USE_MIC */
 
 	struct _starpu_worker *worker;
-	int i;
 	for (i = 0; i < nworkers; i++)
 	{
 		worker = _starpu_get_worker_struct(workerids[i]);
-
-		if (worker->perf_arch == STARPU_CPU_DEFAULT)
+		if (worker->arch == STARPU_CPU_WORKER)
 			cpu_workers[ncpus++] = i;
+#ifdef STARPU_USE_MIC
+		else if(worker->arch == STARPU_MIC_WORKER)
+		{
+			for(j=0; mic_id[j] != worker->mp_nodeid && mic_id[j] != -1 && j<nb_mics; j++);
+			if(j<nb_mics)
+			{
+				if(mic_id[j] == -1)
+				{
+					mic_id[j] = worker->mp_nodeid;					
+				}
+				mic_workers[j][nmics_table[j]++] = i;
+			}
+		}
+#endif /* STARPU_USE_MIC */
+
 	}
 
+
 	min = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
 	if (min < 2)
 		min = 2;
 	max = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
 	if (max == -1 || max > (int) ncpus)
 		max = ncpus;
-
-	int size;
-	for (size = min; size <= max; size *= 2)
+	
+	assign_combinations_without_hwloc(workers,cpu_workers,ncpus,min,max);
+#ifdef STARPU_USE_MIC
+	mic_min = starpu_get_env_number("STARPU_MIN_WORKERSIZE");
+	if (mic_min < 2)
+		mic_min = 2;
+	for(i=0; i<nb_mics; i++)
 	{
-		unsigned first_cpu;
-		for (first_cpu = 0; first_cpu < ncpus; first_cpu += size)
-		{
-			if (first_cpu + size <= ncpus)
-			{
-				int found_workerids[size];
-
-				for (i = 0; i < size; i++)
-					found_workerids[i] = cpu_workers[first_cpu + i];
-
-				/* We register this combination */
-				int newworkerid;
-				newworkerid = starpu_combined_worker_assign_workerid(size, found_workerids);
-				STARPU_ASSERT(newworkerid >= 0);
-				workers->add(workers, newworkerid);
-			}
-		}
+		mic_max = starpu_get_env_number("STARPU_MAX_WORKERSIZE");
+		if (mic_max == -1 || mic_max > (int) nmics_table[i])
+			mic_max = nmics_table[i];
+		assign_combinations_without_hwloc(workers,mic_workers[i],nmics_table[i],mic_min,mic_max);
+		free(mic_workers[i]);
 	}
+	free(mic_id);
+	free(nmics_table);
+	free(mic_workers);
+#endif /* STARPU_USE_MIC */
 }
 
 #endif /* STARPU_HAVE_HWLOC */

+ 4 - 1
src/core/jobs.c

@@ -129,9 +129,12 @@ void _starpu_wait_job(struct _starpu_job *j)
 	 * way, _starpu_wait_job won't return until the entire task was really
 	 * executed (so that we cannot destroy the task while it is still being
 	 * manipulated by the driver). */
+
 	while (j->terminated != 2)
+	{
 		STARPU_PTHREAD_COND_WAIT(&j->sync_cond, &j->sync_mutex);
-
+	}
+	
 	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
         _STARPU_LOG_OUT();
 }

+ 0 - 1
src/core/sched_policy.c

@@ -595,7 +595,6 @@ pick:
 			else
 				sched_ctx = _get_next_sched_ctx_to_pop_into(worker);
 
-
 			if(sched_ctx && sched_ctx->id != STARPU_NMAX_SCHED_CTXS)
 			{
 				if (sched_ctx->sched_policy && sched_ctx->sched_policy->pop_task)

+ 6 - 2
src/core/topology.c

@@ -682,8 +682,8 @@ _starpu_init_mp_config (struct _starpu_machine_config *config,
 		if (0 == _starpu_init_mic_node (config, i, &handles[i], &process[i]))
 			topology->nmicdevices++;
 
-	i = 0;
-	for (; i < topology->nmicdevices; i++)
+	
+	for (i = 0; i < topology->nmicdevices; i++)
 		_starpu_init_mic_config (config, user_conf, i);
 #endif
 }
@@ -1094,6 +1094,9 @@ _starpu_bind_thread_on_cpus (
 		}
 	}
 #else
+#ifdef __GLIBC__
+	pthread_setaffinity_np(pthread_self(),sizeof(cpu_set_t),&combined_worker->cpu_set);
+#endif
 #warning no parallel worker CPU binding support
 #endif
 }
@@ -1260,6 +1263,7 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_c
 
 		if (is_a_set_of_accelerators)
 		{
+/* TODO: il faudrait changer quand on change de device */
 			if (accelerator_bindid == -1)
 				accelerator_bindid = _starpu_get_next_bindid(config, preferred_binding, npreferred);
 

+ 9 - 3
src/core/workers.c

@@ -293,10 +293,15 @@ int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_tas
 	}
 	else
 	{
-		if ((cl->type == STARPU_SPMD)
+		if ((cl->type == STARPU_SPMD) 
 #ifdef STARPU_HAVE_HWLOC
 				|| (cl->type == STARPU_FORKJOIN)
+#else
+#ifdef __GLIBC__
+				|| (cl->type == STARPU_FORKJOIN)
+#endif
 #endif
+
 				)
 		{
 			/* TODO we should add other types of constraints */
@@ -434,7 +439,6 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #ifdef HAVE_AYUDAME_H
 	if (AYU_event) AYU_event(AYU_INIT, 0, NULL);
 #endif
-
 	for (worker = 0; worker < nworkers; worker++)
 	{
 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
@@ -1418,13 +1422,15 @@ unsigned starpu_worker_is_combined_worker(int id)
 
 struct _starpu_sched_ctx *_starpu_get_sched_ctx_struct(unsigned id)
 {
-	if (id == STARPU_NMAX_SCHED_CTXS) return NULL;
+	if(id == STARPU_NMAX_SCHED_CTXS) return NULL;
 	return &config.sched_ctxs[id];
 }
 
 struct _starpu_combined_worker *_starpu_get_combined_worker_struct(unsigned id)
 {
 	unsigned basic_worker_count = starpu_worker_get_count();
+	
+	//_STARPU_DEBUG("basic_worker_count:%d\n",basic_worker_count);
 
 	STARPU_ASSERT(id >= basic_worker_count);
 	return &config.combined_workers[id - basic_worker_count];

+ 4 - 0
src/core/workers.h

@@ -123,6 +123,10 @@ struct _starpu_combined_worker
 	int worker_size;
 	unsigned memory_node; /* which memory node is associated that worker to ? */
 	int combined_workerid[STARPU_NMAXWORKERS];
+#ifdef STARPU_USE_MIC 
+	int count;
+	pthread_mutex_t count_mutex;
+#endif
 
 #ifdef __GLIBC__
 	cpu_set_t cpu_set;

+ 2 - 1
src/datawizard/data_request.c

@@ -358,6 +358,7 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	/* perform the transfer */
 	/* the header of the data must be locked by the worker that submitted the request */
 
+
 	r->retval = _starpu_driver_copy_data_1_to_1(handle, src_replicate,
 						    dst_replicate, !(r_mode & STARPU_R), r, may_alloc);
 
@@ -387,7 +388,7 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	/* the request has been handled */
 	_starpu_spin_lock(&r->lock);
 	starpu_handle_data_request_completion(r);
-
+_STARPU_DEBUG("MEMORY!\n");
 	return 0;
 }
 

+ 0 - 2
src/datawizard/filters.c

@@ -258,9 +258,7 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 		void *ptr;
 		ptr = starpu_data_handle_to_pointer(child, 0);
 		if (ptr != NULL)
-		{
 			_starpu_data_register_ram_pointer(child, ptr);
-		}
 	}
 	/* now let the header */
 	_starpu_spin_unlock(&initial_handle->header_lock);

+ 9 - 11
src/datawizard/reduction.c

@@ -86,18 +86,16 @@ void _starpu_redux_init_data_replicate(starpu_data_handle_t handle, struct _star
 #ifdef STARPU_USE_MIC
 	if (starpu_worker_get_type(workerid) == STARPU_MIC_WORKER)
 	{
-		const struct _starpu_mp_node *node = _starpu_mic_src_get_actual_thread_mp_node();
-		enum _starpu_mp_command answer;
-		void *arg = NULL;
-		int arg_size = 0;
-
-		// XXX: give the correct coreid.
+		struct _starpu_mp_node *node = _starpu_mic_src_get_actual_thread_mp_node();
+		int devid = _starpu_get_worker_struct(workerid)->devid;
+		void * arg;
+		int arg_size;
 		_starpu_src_common_execute_kernel(node,
-						  (void(*)(void))init_func, 0,
-						  &handle, &(replicate->data_interface), 1,
-						  NULL, 0);
-		answer = _starpu_mp_common_recv_command (node, &arg, &arg_size);
-		STARPU_ASSERT (answer == STARPU_EXECUTION_COMPLETED);
+						 (void(*)(void))init_func, devid,
+						 STARPU_SEQ, 0, 0, &handle, 
+						 &(replicate->data_interface), 1,
+						 NULL, 0);
+		_starpu_src_common_wait_completed_execution(node,devid,&arg,&arg_size);
 	}
 	else
 #endif

+ 88 - 16
src/drivers/driver_common/driver_common.c

@@ -95,7 +95,7 @@ void _starpu_driver_end_job(struct _starpu_worker *args, struct _starpu_job *j,
 	}
 
 	if (starpu_top)
-	  _starpu_top_task_ended(task,workerid,codelet_end);
+		_starpu_top_task_ended(task,workerid,codelet_end);
 
 	args->status = STATUS_UNKNOWN;
 }
@@ -129,9 +129,9 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 			profiling_info->workerid = workerid;
 
 			_starpu_worker_update_profiling_info_executing(workerid, &measured_ts, 1,
-				profiling_info->used_cycles,
-				profiling_info->stall_cycles,
-				profiling_info->power_consumed);
+								       profiling_info->used_cycles,
+								       profiling_info->stall_cycles,
+								       profiling_info->power_consumed);
 			updated =  1;
 		}
 
@@ -150,6 +150,29 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 	}
 }
 
+
+
+static void _starpu_worker_set_status_sleeping(int workerid)
+{
+	if (_starpu_worker_get_status(workerid) != STATUS_SLEEPING)
+	{
+		_STARPU_TRACE_WORKER_SLEEP_START;
+		_starpu_worker_restart_sleeping(workerid);
+		_starpu_worker_set_status(workerid, STATUS_SLEEPING);
+	}
+
+}
+
+static void _starpu_worker_set_status_wakeup(int workerid)
+{
+	if (_starpu_worker_get_status(workerid) == STATUS_SLEEPING)
+	{
+		_STARPU_TRACE_WORKER_SLEEP_END;
+		_starpu_worker_stop_sleeping(workerid);
+		_starpu_worker_set_status(workerid, STATUS_UNKNOWN);
+	}
+}
+
 /* Workers may block when there is no work to do at all. */
 struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int workerid, unsigned memnode)
 {
@@ -175,12 +198,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 		 * driver may go block just after the scheduler got a new task to be
 		 * executed, and thus hanging. */
 
-		if (_starpu_worker_get_status(workerid) != STATUS_SLEEPING)
-		{
-			_STARPU_TRACE_WORKER_SLEEP_START;
-			_starpu_worker_restart_sleeping(workerid);
-			_starpu_worker_set_status(workerid, STATUS_SLEEPING);
-		}
+		_starpu_worker_set_status_sleeping(workerid);
 
 		if (_starpu_worker_can_block(memnode))
 			STARPU_PTHREAD_COND_WAIT(&args->sched_cond, &args->sched_mutex);
@@ -235,12 +253,7 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 		perf_counters->notify_idle_end(task->sched_ctx, args->workerid);
 #endif //STARPU_USE_SC_HYPERVISOR
 
-	if (_starpu_worker_get_status(workerid) == STATUS_SLEEPING)
-	{
-		_STARPU_TRACE_WORKER_SLEEP_END;
-		_starpu_worker_stop_sleeping(workerid);
-		_starpu_worker_set_status(workerid, STATUS_UNKNOWN);
-	}
+	_starpu_worker_set_status_wakeup(workerid);
 
 #ifdef HAVE_AYUDAME_H
 	if (AYU_event)
@@ -252,3 +265,62 @@ struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int wor
 
 	return task;
 }
+
+
+int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_task ** tasks, int nworkers)
+{
+	int i, count = 0;
+	struct _starpu_job * j;
+	int is_parallel_task;
+	struct _starpu_combined_worker *combined_worker;
+	/*for each worker*/
+	for (i = 0; i < nworkers; i++)
+	{
+		/*if the worker is already executinf a task then */
+		if(workers[i].current_task)
+		{
+			tasks[i] = NULL;
+		}
+		/*else try to pop a task*/
+		else
+		{
+			STARPU_PTHREAD_MUTEX_LOCK(&workers[i].sched_mutex);
+			_starpu_set_local_worker_key(&workers[i]);
+			tasks[i] = _starpu_pop_task(&workers[i]);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&workers[i].sched_mutex);
+			if(tasks[i] != NULL)
+			{
+				count ++;
+				j = _starpu_get_job_associated_to_task(tasks[i]);
+				is_parallel_task = (j->task_size > 1);
+				workers[i].current_task = j->task;
+				/* Get the rank in case it is a parallel task */
+				if (is_parallel_task)
+				{
+
+					STARPU_PTHREAD_MUTEX_LOCK(&j->sync_mutex);
+					workers[i].current_rank = j->active_task_alias_count++;
+					STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
+					
+					combined_worker = _starpu_get_combined_worker_struct(j->combined_workerid);
+					workers[i].combined_workerid = j->combined_workerid;
+					workers[i].worker_size = combined_worker->worker_size;
+				}
+				else
+				{
+					workers[i].combined_workerid = workers[i].workerid;
+					workers[i].worker_size = 1;
+					workers[i].current_rank = 0;
+				}
+
+				_starpu_worker_set_status_wakeup(workers[i].workerid);
+			}
+			else
+			{
+				_starpu_worker_set_status_sleeping(workers[i].workerid);
+			}
+		}
+	}
+	return count;
+}
+

+ 1 - 1
src/drivers/driver_common/driver_common.h

@@ -32,5 +32,5 @@ void _starpu_driver_update_job_feedback(struct _starpu_job *j, struct _starpu_wo
 					struct timespec *codelet_start, struct timespec *codelet_end, int profiling);
 
 struct starpu_task *_starpu_get_worker_task(struct _starpu_worker *args, int workerid, unsigned memnode);
-
+int _starpu_get_multi_worker_task(struct _starpu_worker *workers, struct starpu_task ** tasks, int nworker);
 #endif // __DRIVER_COMMON_H__

+ 18 - 4
src/drivers/mic/driver_mic_common.c

@@ -19,7 +19,6 @@
 #include <drivers/mp_common/mp_common.h>
 #include <drivers/mic/driver_mic_common.h>
 
-
 void _starpu_mic_common_report_scif_error(const char *func, const char *file, const int line, const int status)
 {
 	const char *errormsg = strerror(status);
@@ -33,10 +32,25 @@ void _starpu_mic_common_report_scif_error(const char *func, const char *file, co
 
 void _starpu_mic_common_send(const struct _starpu_mp_node *node, void *msg, int len)
 {
-	if ((scif_send(node->mp_connection.mic_endpoint, msg, len, SCIF_SEND_BLOCK)) < 0)
+  if ((scif_send(node->mp_connection.mic_endpoint, msg, len, SCIF_SEND_BLOCK)) < 0)
 		STARPU_MP_COMMON_REPORT_ERROR(node, errno);
 }
 
+
+/* Teel is the mic endpoint is ready
+ * return 1 if a message has been receive, 0 if no message has been receive
+ */
+int _starpu_mic_common_recv_is_ready(const struct _starpu_mp_node *mp_node)
+{
+  struct scif_pollepd pollepd;
+  pollepd.epd = mp_node->mp_connection.mic_endpoint;
+  pollepd.events = SCIF_POLLIN;
+  pollepd.revents = 0;
+  return  scif_poll(&pollepd,1,0);
+	
+}
+
+
 /* Handles the error so the caller (which must be generic) doesn't have to
  * care about it.
  */
@@ -49,7 +63,7 @@ void _starpu_mic_common_recv(const struct _starpu_mp_node *node, void *msg, int
 
 /* Handles the error so the caller (which must be generic) doesn't have to
  * care about it.
- */
+x */
 void _starpu_mic_common_dt_send(const struct _starpu_mp_node *mp_node, void *msg, int len)
 {
 	if ((scif_send(mp_node->host_sink_dt_connection.mic_endpoint, msg, len, SCIF_SEND_BLOCK)) < 0)
@@ -114,7 +128,7 @@ void _starpu_mic_common_accept(scif_epd_t *endpoint, uint16_t port_number)
 	_STARPU_DEBUG("MIC accepting connection on %u...\n", port_number);
 	if ((scif_accept(init_epd, &portID, endpoint, SCIF_ACCEPT_SYNC)) < 0)
 		STARPU_MIC_COMMON_REPORT_SCIF_ERROR(errno);
-	_STARPU_DEBUG("done\n", init_epd);
+	_STARPU_DEBUG("done : %d\n", init_epd);
 
 	scif_close(init_epd);
 }

+ 2 - 0
src/drivers/mic/driver_mic_common.h

@@ -53,6 +53,8 @@ struct _starpu_mic_free_command
 
 void _starpu_mic_common_report_scif_error(const char *func, const char *file, int line, const int status);
 
+int _starpu_mic_common_recv_is_ready(const struct _starpu_mp_node *mp_node);
+
 void _starpu_mic_common_send(const struct _starpu_mp_node *node, void *msg, int len);
 
 void _starpu_mic_common_recv(const struct _starpu_mp_node *node, void *msg, int len);

+ 88 - 14
src/drivers/mic/driver_mic_sink.c

@@ -16,12 +16,14 @@
 
 
 #include <errno.h>
+#include <dlfcn.h>
 
 #include <common/COISysInfo_common.h>
 
 #include <starpu.h>
 #include <drivers/mp_common/mp_common.h>
 #include <drivers/mp_common/sink_common.h>
+#include <datawizard/interfaces/data_interface.h>
 
 #include "driver_mic_common.h"
 #include "driver_mic_sink.h"
@@ -29,17 +31,27 @@
 /* Initialize the MIC sink, initializing connection to the source
  * and to the other devices (not implemented yet).
  */
-
 void _starpu_mic_sink_init(struct _starpu_mp_node *node)
 {
-	//unsigned int i;
-	
+	pthread_t self;
+	cpu_set_t cpuset;
+
+	/*Bind on the first core*/
+	self = pthread_self();
+	CPU_ZERO(&cpuset);
+	CPU_SET(241,&cpuset);
+	pthread_setaffinity_np(self,sizeof(cpu_set_t),&cpuset);
+
+
 	/* Initialize connection with the source */
 	_starpu_mic_common_accept(&node->mp_connection.mic_endpoint,
 					 STARPU_MIC_SOURCE_PORT_NUMBER);
 
 	_starpu_mic_common_accept(&node->host_sink_dt_connection.mic_endpoint,
 									 STARPU_MIC_SOURCE_DT_PORT_NUMBER);
+	
+	node->nb_cores = COISysGetHardwareThreadCount() - COISysGetHardwareThreadCount() / COISysGetCoreCount();
+	node->thread_table = malloc(sizeof(pthread_t)*node->nb_cores);
 
 	//node->sink_sink_dt_connections = malloc(node->nb_mp_sinks * sizeof(union _starpu_mp_connection));
 
@@ -54,11 +66,58 @@ void _starpu_mic_sink_init(struct _starpu_mp_node *node)
 	//								STARPU_MIC_SINK_SINK_DT_PORT_NUMBER(node->devid, i));
 }
 
-/* Deinitialize the MIC sink, close all the connections.
+/* Launch all workers on the mic
  */
+void _starpu_mic_sink_launch_workers(struct _starpu_mp_node *node)
+{
+	int i, ret;
+	struct arg_sink_thread * arg;
+	cpu_set_t cpuset;
+	pthread_attr_t attr;
+	pthread_t thread;
+	
+	/*for each core init the mutex, the task pointer and launch the thread */
+	for(i=0; i<node->nb_cores; i++)
+	{
+		//init the set
+		CPU_ZERO(&cpuset);
+		CPU_SET(i,&cpuset);
+
+		ret = pthread_attr_init(&attr);
+		STARPU_ASSERT(ret == 0);
+		ret = pthread_attr_setaffinity_np(&attr, sizeof(cpu_set_t), &cpuset);
+		STARPU_ASSERT(ret == 0);
+
+		/*prepare the argument for the thread*/
+		arg= malloc(sizeof(struct arg_sink_thread));
+		arg->coreid = i;
+		arg->node = node;
+		
+		ret = pthread_create(&thread, &attr, _starpu_sink_thread, arg);
+		STARPU_ASSERT(ret == 0);
+		((pthread_t *)node->thread_table)[i] = thread;
+	}
+
+}
 
+/* Deinitialize the MIC sink, close all the connections.
+ */
 void _starpu_mic_sink_deinit(struct _starpu_mp_node *node)
 {
+	
+	int i;
+	node->is_running = 0;
+	for(i=0; i<node->nb_cores; i++)
+	{
+		sem_post(&node->sem_run_table[i]);
+		pthread_join(((pthread_t *)node->thread_table)[i],NULL);
+	}
+
+	free(node->thread_table);
+
+	scif_close(node->host_sink_dt_connection.mic_endpoint);
+	scif_close(node->mp_connection.mic_endpoint);
+
 	//unsigned int i;
 
 	//for (i = 0; i < node->nb_mp_sinks; ++i)
@@ -69,14 +128,11 @@ void _starpu_mic_sink_deinit(struct _starpu_mp_node *node)
 
 	//free(node->sink_sink_dt_connections);
 
-	scif_close(node->host_sink_dt_connection.mic_endpoint);
-	scif_close(node->mp_connection.mic_endpoint);
 }
 
 /* Report an error which occured when using a MIC device
  * and print this error in a human-readable style
  */
-
 void _starpu_mic_sink_report_error(const char *func, const char *file, const int line, const int status)
 {
 	const char *errormsg = strerror(status);
@@ -84,13 +140,6 @@ void _starpu_mic_sink_report_error(const char *func, const char *file, const int
 	STARPU_ASSERT(0);
 }
 
-/* Return the number of cores on the callee, a MIC device or Processor Xeon
- */
-unsigned int _starpu_mic_sink_get_nb_core(void)
-{
-	return (unsigned int) COISysGetCoreCount();
-}
-
 /* Allocate memory on the MIC.
  * Memory is register for remote direct access. */
 void _starpu_mic_sink_allocate(const struct _starpu_mp_node *mp_node, void *arg, int arg_size)
@@ -133,3 +182,28 @@ void _starpu_mic_sink_free(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUT
 #endif
 	free(addr);
 }
+
+
+/* bind the thread to a core
+ */
+void _starpu_mic_sink_bind_thread(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, int coreid, int * core_table, int nb_core)
+{
+	cpu_set_t cpuset;
+	int i;
+
+  	//init the set
+	CPU_ZERO(&cpuset);
+
+	//adding the core to the set
+	for(i=0;i<nb_core;i++)
+		CPU_SET(core_table[i],&cpuset);
+
+	pthread_setaffinity_np(((pthread_t*)mp_node->thread_table)[coreid],sizeof(cpu_set_t),&cpuset);
+}
+
+void (*_starpu_mic_sink_lookup (const struct _starpu_mp_node * node STARPU_ATTRIBUTE_UNUSED, char* func_name))(void)
+{
+	void *dl_handle = dlopen(NULL, RTLD_NOW);
+	return dlsym(dl_handle, func_name);
+}
+

+ 5 - 3
src/drivers/mic/driver_mic_sink.h

@@ -34,13 +34,15 @@
 void _starpu_mic_sink_report_error(const char *func, const char *file, const int line, const int status);
 
 void _starpu_mic_sink_init(struct _starpu_mp_node *node);
-
+void _starpu_mic_sink_launch_workers(struct _starpu_mp_node *node);
 void _starpu_mic_sink_deinit(struct _starpu_mp_node *node);
 
-unsigned int _starpu_mic_sink_get_nb_core(void);
-
 void _starpu_mic_sink_allocate(const struct _starpu_mp_node *mp_node, void *arg, int arg_size);
 void _starpu_mic_sink_free(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, void *arg, int arg_size);
+void _starpu_mic_sink_bind_thread(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, int coreid, int * core_table, int nb_core);
+
+void (*_starpu_mic_sink_lookup (const struct _starpu_mp_node * node STARPU_ATTRIBUTE_UNUSED,
+			char* func_name))(void);
 
 #endif /* STARPU_USE_MIC */
 

+ 50 - 215
src/drivers/mic/driver_mic_source.c

@@ -73,7 +73,7 @@ starpu_pthread_mutex_t nb_mic_worker_init_mutex = PTHREAD_MUTEX_INITIALIZER;
 //	return config->workers[workerid].devid;
 //}
 
-const struct _starpu_mp_node *_starpu_mic_src_get_actual_thread_mp_node()
+struct _starpu_mp_node *_starpu_mic_src_get_actual_thread_mp_node()
 {
 	struct _starpu_worker *actual_worker = _starpu_get_local_worker_key();
 	STARPU_ASSERT(actual_worker);
@@ -110,96 +110,6 @@ void _starpu_mic_clear_kernels(void)
 	}
 }
 
-static int
-_starpu_mic_src_finalize_job (struct _starpu_job *j, struct _starpu_worker *worker)
-{
-	uint32_t mask = 0;
-	int profiling = starpu_profiling_status_get();
-	struct timespec codelet_end;
-
-	_starpu_driver_end_job(worker, j, worker->perf_arch, &codelet_end, 0,
-			       profiling);
-
-	_starpu_driver_update_job_feedback(j, worker, worker->perf_arch,
-					   &j->cl_start, &codelet_end,
-					   profiling);
-
-	_starpu_push_task_output (j, mask);
-
-	_starpu_handle_job_termination(j);
-
-	return 0;
-}
-
-static int
-_starpu_mic_src_process_completed_job (struct _starpu_worker_set *workerset)
-{
-	struct _starpu_mp_node *node = mic_nodes[workerset->workers[0].mp_nodeid];
-	enum _starpu_mp_command answer;
-	void *arg;
-	int arg_size;
-
-	answer = _starpu_mp_common_recv_command (node, &arg, &arg_size);
-	STARPU_ASSERT (answer == STARPU_EXECUTION_COMPLETED);
-
-	void *arg_ptr = arg;
-	int coreid;
-
-	coreid = *(int *) arg_ptr;
-	arg_ptr += sizeof (coreid); // Useless.
-
-	struct _starpu_worker *worker = &workerset->workers[coreid];
-	struct starpu_task *task = worker->current_task;
-	struct _starpu_job *j = _starpu_get_job_associated_to_task (task);
-
-	_starpu_mic_src_finalize_job (j, worker);
-
-	worker->current_task = NULL;
-
-	return 0;
-}
-
-
-static int _starpu_mic_src_execute_job(struct _starpu_job *j, struct _starpu_worker *args)
-{
-	int ret;
-	uint32_t mask = 0;
-
-	STARPU_ASSERT(j);
-	struct starpu_task *task = j->task;
-
-	//struct timespec codelet_end;
-
-	int profiling = starpu_profiling_status_get();
-	unsigned calibrate_model = 0;
-
-	STARPU_ASSERT(task);
-	struct starpu_codelet *cl = task->cl;
-	STARPU_ASSERT(cl);
-
-	if (cl->model && cl->model->benchmarking)
-		calibrate_model = 1;
-
-	ret = _starpu_fetch_task_input(j, mask);
-	if (ret != 0)
-	{
-		/* there was not enough memory, so the input of
-		 * the codelet cannot be fetched ... put the
-		 * codelet back, and try it later */
-		return -EAGAIN;
-	}
-
-
-	starpu_mic_kernel_t kernel = _starpu_mic_src_get_kernel_from_codelet(j->task->cl, j->nimpl);
-
-	_starpu_driver_start_job (args, j, &j->cl_start, 0, profiling);
-
-	_starpu_src_common_execute_kernel_from_task(mic_nodes[args->mp_nodeid],
-						    (void (*)(void)) kernel, args->devid, task);
-
-	return 0;
-}
-
 int _starpu_mic_src_register_kernel(starpu_mic_func_symbol_t *symbol, const char *func_name)
 {
 	unsigned int func_name_size = (strlen(func_name) + 1) * sizeof(char);
@@ -248,9 +158,11 @@ int _starpu_mic_src_register_kernel(starpu_mic_func_symbol_t *symbol, const char
 	return 0;
 }
 
+
 starpu_mic_kernel_t _starpu_mic_src_get_kernel(starpu_mic_func_symbol_t symbol)
 {
 	int workerid = starpu_worker_get_id();
+	
 	/* This function has to be called in the codelet only, by the thread
 	 * which will handle the task */
 	if (workerid < 0)
@@ -365,6 +277,43 @@ starpu_mic_kernel_t _starpu_mic_src_get_kernel_from_codelet(struct starpu_codele
 	return kernel;
 }
 
+
+
+void(* _starpu_mic_src_get_kernel_from_job(const struct _starpu_mp_node *node STARPU_ATTRIBUTE_UNUSED, struct _starpu_job *j))(void)
+{
+	starpu_mic_kernel_t kernel = NULL;
+
+	starpu_mic_func_t func = _starpu_task_get_mic_nth_implementation(j->task->cl, j->nimpl);
+	if (func)
+	{
+		/* We execute the function contained in the codelet, it must return a
+		 * pointer to the function to execute on the device, either specified
+		 * directly by the user or by a call to starpu_mic_get_func().
+		 */
+		kernel = func();
+	}
+	else
+	{
+		/* If user dont define any starpu_mic_fun_t in cl->mic_func we try to use
+		 * cpu_func_name.
+		 */
+		char *func_name = _starpu_task_get_cpu_name_nth_implementation(j->task->cl, j->nimpl);
+		if (func_name)
+		{
+			starpu_mic_func_symbol_t symbol;
+
+			_starpu_mic_src_register_kernel(&symbol, func_name);
+
+			kernel = _starpu_mic_src_get_kernel(symbol);
+		}
+	}
+	STARPU_ASSERT(kernel);
+
+	return (void (*)(void))kernel;
+}
+
+
+
 /* Initialize the node structure describing the MIC source.
  */
 void _starpu_mic_src_init(struct _starpu_mp_node *node)
@@ -553,18 +502,20 @@ int _starpu_mic_request_is_complete(struct _starpu_mic_async_event *event)
 	return 1;
 }
 
+
+
 void *_starpu_mic_src_worker(void *arg)
 {
-	struct _starpu_worker_set *args = arg;
+	struct _starpu_worker_set *worker_set = arg;
 	/* As all workers of a set share common data, we just use the first
 	 * one for intializing the following stuffs. */
-	struct _starpu_worker *baseworker = &args->workers[0];
+	struct _starpu_worker *baseworker = &worker_set->workers[0];
 	struct _starpu_machine_config *config = baseworker->config;
 	unsigned baseworkerid = baseworker - config->workers;
 	unsigned mp_nodeid = baseworker->mp_nodeid;
 	unsigned i;
 
-	unsigned memnode = baseworker->memory_node;
+	/* unsigned memnode = baseworker->memory_node; */
 
 	_starpu_worker_init(baseworker, _STARPU_FUT_MIC_KEY);
 
@@ -582,131 +533,15 @@ void *_starpu_mic_src_worker(void *arg)
 	_STARPU_TRACE_WORKER_INIT_END;
 
 	/* tell the main thread that this one is ready */
-	STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);
-	args->set_is_initialized = 1;
-	STARPU_PTHREAD_COND_SIGNAL(&args->ready_cond);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&args->mutex);
-
+	STARPU_PTHREAD_MUTEX_LOCK(&worker_set->mutex);
+	worker_set->set_is_initialized = 1;
+	STARPU_PTHREAD_COND_SIGNAL(&worker_set->ready_cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&worker_set->mutex);
 
-	while (_starpu_machine_is_running())
-	{
-		int res;
-		struct starpu_task *task = NULL;
-		struct _starpu_job * j;
-		unsigned micworkerid = 0;
-
-		_STARPU_TRACE_START_PROGRESS(memnode);
-		_starpu_datawizard_progress(memnode, 1);
-		_STARPU_TRACE_END_PROGRESS(memnode);
-
-		STARPU_PTHREAD_MUTEX_LOCK(&baseworker->sched_mutex);
-
-		/* We pop tasklists of each worker in the set and process the
-		 * first non-empty list. */
-		for (micworkerid = 0 ; (micworkerid < args->nworkers) && (task == NULL); micworkerid++)
-		    task = _starpu_pop_task (&args->workers[micworkerid]);
-
-		if (task != NULL) {
-			micworkerid--;
-			goto task_found;
-		}
-
-#if 0 // XXX: synchronous execution for now
-		/* No task to submit, so we can poll the MIC device for
-		 * completed jobs. */
-		struct pollfd fd = {
-		    .fd = mic_nodes[baseworker->mp_nodeid]->mp_connection.mic_endpoint,
-		    .events = POLLIN
-		};
-
-		if (0 < poll (&fd, 1, 0)) {
-		    _starpu_mic_src_process_completed_job (args);
-		    goto restart_loop;
-		}
-#endif
-
-		/* At this point, there is really nothing to do for the thread
-		 * so we can block.
-		 * XXX: blocking drivers is in fact broken. DO NOT USE IT ! */
-		if (_starpu_worker_get_status(baseworkerid) != STATUS_SLEEPING)
-		{
-			_STARPU_TRACE_WORKER_SLEEP_START;
-			_starpu_worker_restart_sleeping(baseworkerid);
-			_starpu_worker_set_status(baseworkerid, STATUS_SLEEPING);
-		}
-
-		if (_starpu_worker_can_block(memnode))
-			STARPU_PTHREAD_COND_WAIT(&baseworker->sched_cond, &baseworker->sched_mutex);
-		else
-		{
-			if (_starpu_machine_is_running())
-				STARPU_UYIELD();
-		}
-
-		if (_starpu_worker_get_status(baseworkerid) == STATUS_SLEEPING)
-		{
-			_STARPU_TRACE_WORKER_SLEEP_END;
-			_starpu_worker_stop_sleeping(baseworkerid);
-			_starpu_worker_set_status(baseworkerid, STATUS_UNKNOWN);
-		}
-
-	restart_loop:
-		STARPU_PTHREAD_MUTEX_UNLOCK(&baseworker->sched_mutex);
-		continue;
-
-	task_found:
-		/* If the MIC core associated to `micworkerid' is already
-		 * processing a job, we push back this one in the worker task
-		 * list. */
-		STARPU_PTHREAD_MUTEX_UNLOCK(&baseworker->sched_mutex);
-
-		if (args->workers[micworkerid].current_task) {
-		    _starpu_push_task_to_workers(task);
-		    continue;
-		}
-
-		STARPU_ASSERT(task);
-		j = _starpu_get_job_associated_to_task(task);
-
-		/* can a MIC device do that task ? */
-		if (!_STARPU_MIC_MAY_PERFORM(j))
-		{
-			/* this isn't a mic task */
-			_starpu_push_task_to_workers(task);
-			continue;
-		}
-
-		args->workers[micworkerid].current_task = j->task;
-
-		res = _starpu_mic_src_execute_job (j, &args->workers[micworkerid]);
-
-		if (res)
-		{
-			switch (res)
-			{
-				case -EAGAIN:
-					_STARPU_DISP("ouch, Xeon Phi could not actually run task %p, putting it back...\n", task);
-					_starpu_push_task_to_workers(task);
-					STARPU_ABORT();
-					continue;
-				default:
-					STARPU_ASSERT(0);
-			}
-		}
-
-		/* XXX: synchronous execution for now */
-		_starpu_mic_src_process_completed_job (args);
-	}
+	_starpu_src_common_worker(worker_set, baseworkerid, mic_nodes[mp_nodeid]);
 
 	_STARPU_TRACE_WORKER_DEINIT_START;
 
-	_starpu_handle_all_pending_node_data_requests(memnode);
-
-	/* In case there remains some memory that was automatically
-	 * allocated by StarPU, we release it now. Note that data
-	 * coherency is not maintained anymore at that point ! */
-	_starpu_free_all_automatically_allocated_buffers(memnode);
-
 	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_CUDA_KEY);
 
 	return NULL;

+ 3 - 1
src/drivers/mic/driver_mic_source.h

@@ -24,6 +24,7 @@
 
 #include <source/COIProcess_source.h>
 #include <source/COIEngine_source.h>
+#include <core/workers.h>
 
 #include <drivers/mp_common/mp_common.h>
 
@@ -41,9 +42,10 @@ struct _starpu_mic_async_event *event;
 #define STARPU_MIC_SRC_REPORT_SCIF_ERROR(status) \
 	_starpu_mic_src_report_scif_error(__starpu_func__, __FILE__, __LINE__, status)
 
-const struct _starpu_mp_node *_starpu_mic_src_get_actual_thread_mp_node();
+struct _starpu_mp_node *_starpu_mic_src_get_actual_thread_mp_node();
 const struct _starpu_mp_node *_starpu_mic_src_get_mp_node_from_memory_node(int memory_node);
 
+void(* _starpu_mic_src_get_kernel_from_job(const struct _starpu_mp_node *node STARPU_ATTRIBUTE_UNUSED, struct _starpu_job *j))(void);
 int _starpu_mic_src_register_kernel(starpu_mic_func_symbol_t *symbol, const char *func_name);
 starpu_mic_kernel_t _starpu_mic_src_get_kernel(starpu_mic_func_symbol_t symbol);
 

+ 155 - 101
src/drivers/mp_common/mp_common.c

@@ -27,12 +27,14 @@
 #include <drivers/scc/driver_scc_source.h>
 #include <drivers/scc/driver_scc_sink.h>
 
+#include <common/list.h>
+
 /* Allocate and initialize the sink structure, when the function returns
  * all the pointer of functions are linked to the right ones.
  */
 struct _starpu_mp_node * STARPU_ATTRIBUTE_MALLOC
-    _starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind,
-				  int peer_id)
+_starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind,
+			      int peer_id)
 {
 	struct _starpu_mp_node *node;
 
@@ -45,111 +47,120 @@ struct _starpu_mp_node * STARPU_ATTRIBUTE_MALLOC
 	switch(node->kind)
 	{
 #ifdef STARPU_USE_MIC
-		case STARPU_MIC_SOURCE:
-			{
-				node->nb_mp_sinks = starpu_mic_worker_get_count();
-				node->devid = peer_id;
-
-				node->init = _starpu_mic_src_init;
-				node->deinit = _starpu_mic_src_deinit;
-				node->report_error = _starpu_mic_src_report_scif_error;
-
-				node->mp_send = _starpu_mic_common_send;
-				node->mp_recv = _starpu_mic_common_recv;
-				node->dt_send = _starpu_mic_common_dt_send;
-				node->dt_recv = _starpu_mic_common_dt_recv;
-
-				node->execute = NULL;
-				node->nbcores = NULL;
-				node->allocate = NULL;
-				node->free = NULL;
-
-				/* A source node is only working on one core,
-				 * there is no need for this function */
-				node->get_nb_core = NULL;
-			}
-			break;
-
-		case STARPU_MIC_SINK:
-			{
-				node->devid = atoi(getenv("DEVID"));;
-				node->nb_mp_sinks = atoi(getenv("NB_MIC"));
-
-				node->init = _starpu_mic_sink_init;
-				node->deinit = _starpu_mic_sink_deinit;
-				node->report_error = _starpu_mic_sink_report_error;
-
-				node->mp_send = _starpu_mic_common_send;
-				node->mp_recv = _starpu_mic_common_recv;
-				node->dt_send = _starpu_mic_common_dt_send;
-				node->dt_recv = _starpu_mic_common_dt_recv;
-
-				node->execute = _starpu_sink_common_execute;
-				node->nbcores = _starpu_sink_nbcores;
-				node->allocate = _starpu_mic_sink_allocate;
-				node->free = _starpu_mic_sink_free;
-
-				node->get_nb_core = _starpu_mic_sink_get_nb_core;
-			}
-			break;
+	case STARPU_MIC_SOURCE:
+	{
+		node->nb_mp_sinks = starpu_mic_worker_get_count();
+		node->devid = peer_id;
+
+		node->init = _starpu_mic_src_init;
+		node->launch_workers= NULL;
+		node->deinit = _starpu_mic_src_deinit;
+		node->report_error = _starpu_mic_src_report_scif_error;
+
+		node->mp_recv_is_ready = _starpu_mic_common_recv_is_ready;
+		node->mp_send = _starpu_mic_common_send;
+		node->mp_recv = _starpu_mic_common_recv;
+		node->dt_send = _starpu_mic_common_dt_send;
+		node->dt_recv = _starpu_mic_common_dt_recv;
+
+		node->get_kernel_from_job =_starpu_mic_src_get_kernel_from_job;
+		node->lookup = NULL;
+		node->bind_thread = NULL;
+		node->execute = NULL;
+		node->allocate = NULL;
+		node->free = NULL;
+	}
+	break;
+
+	case STARPU_MIC_SINK:
+	{
+		node->devid = atoi(getenv("DEVID"));;
+		node->nb_mp_sinks = atoi(getenv("NB_MIC"));
+
+		node->init = _starpu_mic_sink_init;
+		node->launch_workers = _starpu_mic_sink_launch_workers;
+		node->deinit = _starpu_mic_sink_deinit;
+		node->report_error = _starpu_mic_sink_report_error;
+
+		node->mp_recv_is_ready = _starpu_mic_common_recv_is_ready;
+		node->mp_send = _starpu_mic_common_send;
+		node->mp_recv = _starpu_mic_common_recv;
+		node->dt_send = _starpu_mic_common_dt_send;
+		node->dt_recv = _starpu_mic_common_dt_recv;
+
+		node->get_kernel_from_job = NULL;
+		node->lookup = _starpu_mic_sink_lookup;
+		node->bind_thread = _starpu_mic_sink_bind_thread;
+		node->execute = _starpu_sink_common_execute;
+		node->allocate = _starpu_mic_sink_allocate;
+		node->free = _starpu_mic_sink_free;
+
+	}
+	break;
 #endif /* STARPU_USE_MIC */
 
 #ifdef STARPU_USE_SCC
-		case STARPU_SCC_SOURCE:
-			{
-				node->init = _starpu_scc_src_init;
-				node->deinit = NULL;
-				node->report_error = _starpu_scc_common_report_rcce_error;
-
-				node->mp_send = _starpu_scc_common_send;
-				node->mp_recv = _starpu_scc_common_recv;
-				node->dt_send = _starpu_scc_common_send;
-				node->dt_recv = _starpu_scc_common_recv;
-				node->dt_send_to_device = NULL;
-				node->dt_recv_from_device = NULL;
-
-				node->execute = NULL;
-				node->allocate = NULL;
-				node->free = NULL;
-
-				node->get_nb_core = NULL;
-			}
-			break;
-
-		case STARPU_SCC_SINK:
-			{
-				node->init = _starpu_scc_sink_init;
-				node->deinit = _starpu_scc_sink_deinit;
-				node->report_error = _starpu_scc_common_report_rcce_error;
-
-				node->mp_send = _starpu_scc_common_send;
-				node->mp_recv = _starpu_scc_common_recv;
-				node->dt_send = _starpu_scc_common_send;
-				node->dt_recv = _starpu_scc_common_recv;
-				node->dt_send_to_device = _starpu_scc_sink_send_to_device;
-				node->dt_recv_from_device = _starpu_scc_sink_recv_from_device;
-
-				node->execute = _starpu_scc_sink_execute;
-				node->allocate = _starpu_sink_common_allocate;
-				node->free = _starpu_sink_common_free;
-
-				node->get_nb_core = NULL;
-			}
-			break;
+	case STARPU_SCC_SOURCE:
+	{
+		node->init = _starpu_scc_src_init;
+		node->deinit = NULL;
+		node->deinit = NULL;
+		node->report_error = _starpu_scc_common_report_rcce_error;
+				
+		node->mp_recv_is_ready = _starpu_scc_common_recv_is_ready;
+		node->mp_send = _starpu_scc_common_send;
+		node->mp_recv = _starpu_scc_common_recv;
+		node->dt_send = _starpu_scc_common_send;
+		node->dt_recv = _starpu_scc_common_recv;
+		node->dt_send_to_device = NULL;
+		node->dt_recv_from_device = NULL;
+
+		node->get_kernel_from_job =_starpu_scc_src_get_kernel_from_job;
+		node->lookup = NULL;
+		node->bind_thread = NULL;
+		node->execute = NULL;
+		node->allocate = NULL;
+		node->free = NULL;
+	}
+	break;
+
+	case STARPU_SCC_SINK:
+	{
+		node->init = _starpu_scc_sink_init;
+		node->launch_workers = _starpu_scc_sink_launch_workers;
+		node->deinit = _starpu_scc_sink_deinit;
+		node->report_error = _starpu_scc_common_report_rcce_error;
+
+		node->mp_recv_is_ready = _starpu_scc_common_recv_is_ready;
+		node->mp_send = _starpu_scc_common_send;
+		node->mp_recv = _starpu_scc_common_recv;
+		node->dt_send = _starpu_scc_common_send;
+		node->dt_recv = _starpu_scc_common_recv;
+		node->dt_send_to_device = _starpu_scc_sink_send_to_device;
+		node->dt_recv_from_device = _starpu_scc_sink_recv_from_device;
+
+		node->get_kernel_from_job = NULL;
+		node->lookup = _starpu_scc_sink_lookup;
+		node->bind_thread = _starpu_scc_sink_bind_thread;
+		node->execute = _starpu_scc_sink_execute;
+		node->allocate = _starpu_sink_common_allocate;
+		node->free = _starpu_sink_common_free;
+	}
+	break;
 #endif /* STARPU_USE_SCC */
 
 #ifdef STARPU_USE_MPI
-		case STARPU_MPI_SOURCE:
-			STARPU_ABORT();
-			break;
+	case STARPU_MPI_SOURCE:
+		STARPU_ABORT();
+		break;
 
-		case STARPU_MPI_SINK:
-			STARPU_ABORT();
-			break;
+	case STARPU_MPI_SINK:
+		STARPU_ABORT();
+		break;
 #endif /* STARPU_USE_MPI */
 
-		default:
-			STARPU_ASSERT(0);
+	default:
+		STARPU_ASSERT(0);
 	}
 
 	/* Let's allocate the buffer, we want it to be big enough to contain
@@ -159,15 +170,60 @@ struct _starpu_mp_node * STARPU_ATTRIBUTE_MALLOC
 	if (node->init)
 		node->init(node);
 
+	node->message_queue = mp_message_list_new();
+	STARPU_PTHREAD_MUTEX_INIT(&node->message_queue_mutex,NULL);
+
+	/* If the node is a sink then we must initialize some field */
+	if(node->kind == STARPU_MIC_SINK || node->kind == STARPU_SCC_SINK)
+	{
+		int i;
+		node->is_running = 1;
+		node->run_table = malloc(sizeof(struct mp_task *)*node->nb_cores);
+		node->sem_run_table = malloc(sizeof(sem_t)*node->nb_cores);
+
+		for(i=0; i<node->nb_cores; i++)
+		{
+			node->run_table[i] = NULL;
+			sem_init(&node->sem_run_table[i],0,0);
+		}
+		node->barrier_list = mp_barrier_list_new();
+		STARPU_PTHREAD_MUTEX_INIT(&node->barrier_mutex,NULL);
+
+		STARPU_PTHREAD_BARRIER_INIT(&node->init_completed_barrier, NULL, node->nb_cores+1);
+
+		node->launch_workers(node);
+	}	
+
+
 	return node;
 }
 
 /* Deinitialize the sink structure and release the structure */
-
 void _starpu_mp_common_node_destroy(struct _starpu_mp_node *node)
 {
 	if (node->deinit)
 		node->deinit(node);
+		
+	mp_message_list_delete(node->message_queue);
+	STARPU_PTHREAD_MUTEX_DESTROY(&node->message_queue_mutex);
+
+	/* If the node is a sink then we must destroy some field */
+	if(node->kind == STARPU_MIC_SINK || node->kind == STARPU_SCC_SINK)
+	{
+		int i;
+		for(i=0; i<node->nb_cores; i++)
+		{
+			sem_destroy(&node->sem_run_table[i]);
+		}
+
+		free(node->run_table);
+		free(node->sem_run_table);
+
+		mp_barrier_list_delete(node->barrier_list);
+
+		STARPU_PTHREAD_MUTEX_DESTROY(&node->barrier_mutex);
+		STARPU_PTHREAD_BARRIER_DESTROY(&node->init_completed_barrier);
+	}
 
 	free(node->buffer);
 
@@ -175,7 +231,6 @@ void _starpu_mp_common_node_destroy(struct _starpu_mp_node *node)
 }
 
 /* Send COMMAND to RECIPIENT, along with ARG if ARG_SIZE is non-zero */
-
 void _starpu_mp_common_send_command(const struct _starpu_mp_node *node,
 				    const enum _starpu_mp_command command,
 				    void *arg, int arg_size)
@@ -202,7 +257,6 @@ void _starpu_mp_common_send_command(const struct _starpu_mp_node *node,
  * However, the data pointed by arg shouldn't be relied on after a new call to
  * STARPU_MP_COMMON_RECV_COMMAND as it might corrupt it.
  */
-
 enum _starpu_mp_command _starpu_mp_common_recv_command(const struct _starpu_mp_node *node,
 						       void **arg, int *arg_size)
 {

+ 88 - 29
src/drivers/mp_common/mp_common.h

@@ -18,10 +18,14 @@
 #define __MP_COMMON_H__
 
 #include <pthread.h>
+#include <semaphore.h>
 
 #include <starpu.h>
 #include <common/config.h>
-
+#include <common/list.h>
+#include <common/barrier.h>
+#include <common/thread.h>
+#include <datawizard/interfaces/data_interface.h>
 
 #ifdef STARPU_USE_MP
 
@@ -34,31 +38,31 @@
 #define STARPU_MP_SRC_NODE 0
 #define STARPU_MP_SINK_NODE(a) ((a) + 1)
 
-#define STARPU_MP_COMMON_REPORT_ERROR(node, status) \
+#define STARPU_MP_COMMON_REPORT_ERROR(node, status)			\
 	(node)->report_error(__starpu_func__, __FILE__, __LINE__, (status))
-
-
 enum _starpu_mp_command
 {
-	STARPU_EXIT = 0x00,
-	STARPU_EXECUTE = 0x01,
-	STARPU_ERROR_EXECUTE = 0x02,
-	STARPU_LOOKUP = 0X03,
-	STARPU_ANSWER_LOOKUP = 0X04,
-	STARPU_ERROR_LOOKUP = 0X05,
-	STARPU_ALLOCATE = 0x06,
-	STARPU_ANSWER_ALLOCATE = 0x07,
-	STARPU_ERROR_ALLOCATE = 0x08,
-	STARPU_FREE = 0x09,
-	STARPU_RECV_FROM_HOST = 0x10,
-	STARPU_SEND_TO_HOST = 0x11,
-	STARPU_RECV_FROM_SINK = 0x12,
-	STARPU_SEND_TO_SINK = 0x13,
-	STARPU_TRANSFER_COMPLETE = 0x14,
-	STARPU_SINK_NBCORES = 0x15,
-	STARPU_ANSWER_SINK_NBCORES = 0x16,
-	STARPU_EXECUTION_SUBMITTED = 0x17,
-	STARPU_EXECUTION_COMPLETED = 0x18
+	STARPU_EXIT,
+	STARPU_EXECUTE,
+	STARPU_ERROR_EXECUTE,
+	STARPU_LOOKUP,
+	STARPU_ANSWER_LOOKUP,
+	STARPU_ERROR_LOOKUP,
+	STARPU_ALLOCATE,
+	STARPU_ANSWER_ALLOCATE,
+	STARPU_ERROR_ALLOCATE,
+	STARPU_FREE,
+	STARPU_RECV_FROM_HOST,
+	STARPU_SEND_TO_HOST,
+	STARPU_RECV_FROM_SINK,
+	STARPU_SEND_TO_SINK,
+	STARPU_TRANSFER_COMPLETE,
+	STARPU_SINK_NBCORES,
+	STARPU_ANSWER_SINK_NBCORES,
+	STARPU_EXECUTION_SUBMITTED,
+	STARPU_EXECUTION_COMPLETED,
+	STARPU_PRE_EXECUTION,
+	STARPU_SYNC_WORKERS,
 };
 
 enum _starpu_mp_node_kind
@@ -96,12 +100,47 @@ struct _starpu_mp_transfer_command_to_device
 	void *addr;
 };
 
+LIST_TYPE(mp_barrier,
+		int id;
+		_starpu_pthread_barrier_t before_work_barrier;
+		_starpu_pthread_barrier_t after_work_barrier;
+	 );
+
+LIST_TYPE(mp_message,
+		enum _starpu_mp_command type;
+		char buffer[BUFFER_SIZE];
+		int size;
+	 );
+
+struct mp_task 
+{
+	void (*kernel)(void **, void *);
+	void * interfaces[STARPU_NMAXBUFS]; 
+	unsigned nb_interfaces;
+	void *cl_arg;
+	unsigned coreid;
+	enum starpu_codelet_type type;
+	int is_parallel_task;
+	int combined_workerid;
+ 	struct mp_barrier* mp_barrier;
+};
+
+
 /* Message-passing working node, whether source
  * or sink */
 struct _starpu_mp_node
 {
 	enum _starpu_mp_node_kind kind;
 
+	int baseworkerid;
+
+	/*the number of core on the device
+	 * Must be initialized during init function*/
+	int nb_cores;
+
+	/*Is starpu running*/
+	int is_running;
+
 	/* Buffer used for scif data transfers, allocated
 	 * during node initialization.
 	 * Size : BUFFER_SIZE */
@@ -117,7 +156,7 @@ struct _starpu_mp_node
 	int devid;
 
 	/* Only MIC use this for now !!
-	*  Is the number ok MIC on the system. */
+	 *  Is the number ok MIC on the system. */
 	unsigned int nb_mp_sinks;
 
 	/* Connection used for command passing between the host thread and the
@@ -138,12 +177,32 @@ struct _starpu_mp_node
 	 *  - sink_sink_dt_connections[j] is not initialized for the sink number j. */
 	union _starpu_mp_connection *sink_sink_dt_connections;
 
+	/* */
+	_starpu_pthread_barrier_t init_completed_barrier; 
+	
+	/* table to store pointer of the thread workers*/
+	void* thread_table;
+
+        /*list where threads add messages to send to the source node */
+        struct mp_message_list* message_queue;
+	starpu_pthread_mutex_t message_queue_mutex;
+
+	/*list of barrier for combined worker*/
+	struct mp_barrier_list* barrier_list;
+	starpu_pthread_mutex_t barrier_mutex;
+
+	/*table where worker comme pick task*/
+	struct mp_task ** run_table;
+	sem_t * sem_run_table;
+
 	/* Node general functions */
 	void (*init)(struct _starpu_mp_node *node);
+	void (*launch_workers)(struct _starpu_mp_node *node);
 	void (*deinit)(struct _starpu_mp_node *node);
 	void (*report_error)(const char *, const char *, const int, const int);
 
 	/* Message passing */
+	int (*mp_recv_is_ready)(const struct _starpu_mp_node *);
 	void (*mp_send)(const struct _starpu_mp_node *, void *, int);
 	void (*mp_recv)(const struct _starpu_mp_node *, void *, int);
 
@@ -153,12 +212,12 @@ struct _starpu_mp_node
 	void (*dt_send_to_device)(const struct _starpu_mp_node *, int, void *, int);
 	void (*dt_recv_from_device)(const struct _starpu_mp_node *, int, void *, int);
 
-	void (*execute)(const struct _starpu_mp_node *, void *, int);
-	void (*nbcores)(const struct _starpu_mp_node *);
+	void (*(*get_kernel_from_job)(const struct _starpu_mp_node *,struct _starpu_job *))(void);
+	void (*(*lookup)(const struct _starpu_mp_node *, char* ))(void);
+	void (*bind_thread)(const struct _starpu_mp_node *, int,int *,int);
+	void (*execute)(struct _starpu_mp_node *, void *, int);
 	void (*allocate)(const struct _starpu_mp_node *, void *, int);
 	void (*free)(const struct _starpu_mp_node *, void *, int);
-
-	unsigned int (*get_nb_core)(void);
 };
 
 struct _starpu_mp_node * _starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind, int peer_devid);
@@ -170,7 +229,7 @@ void _starpu_mp_common_send_command(const struct _starpu_mp_node *node,
 				    void *arg, int arg_size);
 
 enum _starpu_mp_command _starpu_mp_common_recv_command(const struct _starpu_mp_node *node,
-						    void **arg, int *arg_size);
+						       void **arg, int *arg_size);
 
 
 #endif /* STARPU_USE_MP */

+ 487 - 142
src/drivers/mp_common/sink_common.c

@@ -15,20 +15,21 @@
  */
 
 
-#include <dlfcn.h>
-
 #include <starpu.h>
 #include <common/config.h>
 #include <common/utils.h>
 #include <drivers/mp_common/mp_common.h>
 #include <datawizard/interfaces/data_interface.h>
-
+#include <common/barrier.h>
+#include <core/workers.h>
+#include <common/barrier_counter.h>
 #ifdef STARPU_USE_MIC
 #include <common/COISysInfo_common.h>
 #endif
 
 #include "sink_common.h"
 
+
 /* Return the sink kind of the running process, based on the value of the
  * STARPU_SINK environment variable.
  * If there is no valid value retrieved, return STARPU_INVALID_KIND
@@ -50,90 +51,25 @@ static enum _starpu_mp_node_kind _starpu_sink_common_get_kind(void)
 		return STARPU_INVALID_KIND;
 }
 
-void
-_starpu_sink_nbcores (const struct _starpu_mp_node *node)
-{
-    // Process packet received from `_starpu_src_common_sink_cores'.
-    int nbcores = 1;
-
-#ifdef STARPU_USE_MIC
-    // XXX I currently only support MIC for now.
-    if (STARPU_MIC_SINK == _starpu_sink_common_get_kind ())
-	nbcores = COISysGetCoreCount();
-#endif
 
-    _starpu_mp_common_send_command (node, STARPU_ANSWER_SINK_NBCORES,
-				    &nbcores, sizeof (int));
-}
-
-
-/* Receive paquet from _starpu_src_common_execute_kernel in the form below :
- * [Function pointer on sink, number of interfaces, interfaces
- * (union _starpu_interface), cl_arg]
- * Then call the function given, passing as argument an array containing the
- * addresses of the received interfaces
+/* Send to host the number of cores of the sink device
  */
-void _starpu_sink_common_execute(const struct _starpu_mp_node *node,
-					void *arg, int arg_size)
+static void _starpu_sink_common_get_nb_cores (struct _starpu_mp_node *node)
 {
-	unsigned id = 0;
-
-	void *arg_ptr = arg;
-	void (*kernel)(void **, void *) = NULL;
-	unsigned coreid = 0;
-	unsigned nb_interfaces = 0;
-	void *interfaces[STARPU_NMAXBUFS];
-	void *cl_arg;
-
-	kernel = *(void(**)(void **, void *)) arg_ptr;
-	arg_ptr += sizeof(kernel);
-
-	coreid = *(unsigned *) arg_ptr;
-	arg_ptr += sizeof(coreid);
-
-	nb_interfaces = *(unsigned *) arg_ptr;
-	arg_ptr += sizeof(nb_interfaces);
-
-	/* The function needs an array pointing to each interface it needs
-	 * during execution. As in sink-side there is no mean to know which
-	 * kind of interface to expect, the array is composed of unions of
-	 * interfaces, thus we expect the same size anyway */
-	for (id = 0; id < nb_interfaces; id++)
-	{
-		interfaces[id] = arg_ptr;
-		arg_ptr += sizeof(union _starpu_interface);
-	}
-
-	/* Was cl_arg sent ? */
-	if (arg_size > arg_ptr - arg)
-		cl_arg = arg_ptr;
-	else
-		cl_arg = NULL;
-
-	//_STARPU_DEBUG("telling host that we have submitted the task %p.\n", kernel);
-	/* XXX: in the future, we will not have to directly execute the kernel
-	 * but submit it to the correct local worker. */
-	_starpu_mp_common_send_command(node, STARPU_EXECUTION_SUBMITTED,
-				       NULL, 0);
-
-	//_STARPU_DEBUG("executing the task %p\n", kernel);
-	/* XXX: we keep the synchronous execution model on the sink side for
-	 * now. */
-	kernel(interfaces, cl_arg);
-
-	//_STARPU_DEBUG("telling host that we have finished the task %p.\n", kernel);
-	_starpu_mp_common_send_command(node, STARPU_EXECUTION_COMPLETED,
-				       &coreid, sizeof(coreid));
+	// Process packet received from `_starpu_src_common_sink_cores'.
+     	_starpu_mp_common_send_command (node, STARPU_ANSWER_SINK_NBCORES,
+					&node->nb_cores, sizeof (int));
 }
 
 
+/* Send to host the address of the function given in parameter
+ */
 static void _starpu_sink_common_lookup(const struct _starpu_mp_node *node,
 				       char *func_name)
 {
 	void (*func)(void);
-	void *dl_handle = dlopen(NULL, RTLD_NOW);
-	func = dlsym(dl_handle, func_name);
-
+	func = node->lookup(node,func_name);
+	
 	//_STARPU_DEBUG("Looked up %s, got %p\n", func_name, func);
 
 	/* If we couldn't find the function, let's send an error to the host.
@@ -146,21 +82,24 @@ static void _starpu_sink_common_lookup(const struct _starpu_mp_node *node,
 					       NULL, 0);
 }
 
+
+/* Allocate a memory space and send the address of this space to the host
+ */
 void _starpu_sink_common_allocate(const struct _starpu_mp_node *mp_node,
 				  void *arg, int arg_size)
 {
-    STARPU_ASSERT(arg_size == sizeof(size_t));
-
-    void *addr = malloc(*(size_t *)(arg));
-
-    /* If the allocation fail, let's send an error to the host.
-     */
-    if (addr)
-	_starpu_mp_common_send_command(mp_node, STARPU_ANSWER_ALLOCATE,
-				       &addr, sizeof(addr));
-    else
-	_starpu_mp_common_send_command(mp_node, STARPU_ERROR_ALLOCATE,
-				       NULL, 0);
+	STARPU_ASSERT(arg_size == sizeof(size_t));
+
+	void *addr = malloc(*(size_t *)(arg));
+
+	/* If the allocation fail, let's send an error to the host.
+	 */
+	if (addr)
+		_starpu_mp_common_send_command(mp_node, STARPU_ANSWER_ALLOCATE,
+					       &addr, sizeof(addr));
+	else
+		_starpu_mp_common_send_command(mp_node, STARPU_ERROR_ALLOCATE,
+					       NULL, 0);
 }
 
 void _starpu_sink_common_free(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED,
@@ -174,56 +113,129 @@ void _starpu_sink_common_free(const struct _starpu_mp_node *mp_node STARPU_ATTRI
 static void _starpu_sink_common_copy_from_host(const struct _starpu_mp_node *mp_node,
 					       void *arg, int arg_size)
 {
-    STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command));
+	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command));
 
-    struct _starpu_mp_transfer_command *cmd = (struct _starpu_mp_transfer_command *)arg;
+	struct _starpu_mp_transfer_command *cmd = (struct _starpu_mp_transfer_command *)arg;
 
-    mp_node->dt_recv(mp_node, cmd->addr, cmd->size);
+	mp_node->dt_recv(mp_node, cmd->addr, cmd->size);
 }
 
 static void _starpu_sink_common_copy_to_host(const struct _starpu_mp_node *mp_node,
 					     void *arg, int arg_size)
 {
-    STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command));
+	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command));
 
-    struct _starpu_mp_transfer_command *cmd = (struct _starpu_mp_transfer_command *)arg;
+	struct _starpu_mp_transfer_command *cmd = (struct _starpu_mp_transfer_command *)arg;
 
-    mp_node->dt_send(mp_node, cmd->addr, cmd->size);
+	mp_node->dt_send(mp_node, cmd->addr, cmd->size);
 }
 
 static void _starpu_sink_common_copy_from_sink(const struct _starpu_mp_node *mp_node,
 					       void *arg, int arg_size)
 {
-    STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command_to_device));
+	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command_to_device));
 
-    struct _starpu_mp_transfer_command_to_device *cmd = (struct _starpu_mp_transfer_command_to_device *)arg;
+	struct _starpu_mp_transfer_command_to_device *cmd = (struct _starpu_mp_transfer_command_to_device *)arg;
 
-    mp_node->dt_recv_from_device(mp_node, cmd->devid, cmd->addr, cmd->size);
+	mp_node->dt_recv_from_device(mp_node, cmd->devid, cmd->addr, cmd->size);
 
-    _starpu_mp_common_send_command(mp_node, STARPU_TRANSFER_COMPLETE, NULL, 0);
+	_starpu_mp_common_send_command(mp_node, STARPU_TRANSFER_COMPLETE, NULL, 0);
 }
 
 static void _starpu_sink_common_copy_to_sink(const struct _starpu_mp_node *mp_node,
 					     void *arg, int arg_size)
 {
-    STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command_to_device));
+	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command_to_device));
+
+	struct _starpu_mp_transfer_command_to_device *cmd = (struct _starpu_mp_transfer_command_to_device *)arg;
+
+	mp_node->dt_send_to_device(mp_node, cmd->devid, cmd->addr, cmd->size);
+}
+
+
+/* Receive workers and combined workers and store them into the struct config
+ */
+static void _starpu_sink_common_recv_workers(struct _starpu_mp_node * node, void *arg, int arg_size)
+{
+	/* Retrieve information from the message */
+	STARPU_ASSERT(arg_size == (sizeof(int)*5));
+	void * arg_ptr = arg;
+	int i;
+	
+	int nworkers = *(int *)arg_ptr; 
+	arg_ptr += sizeof(nworkers);
+
+	int worker_size = *(int *)arg_ptr;
+	arg_ptr += sizeof(worker_size);
+
+	int combined_worker_size = *(int *)arg_ptr;
+	arg_ptr += sizeof(combined_worker_size);
+	
+	int baseworkerid = *(int *)arg_ptr;
+	arg_ptr += sizeof(baseworkerid);
+
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+	config->topology.nworkers = *(int *)arg_ptr;
+
+
+	/* Retrieve workers */
+	struct _starpu_worker * workers = &config->workers[baseworkerid];
+	node->dt_recv(node,workers,worker_size);
+	
+	/* Update workers to have coherent field */
+	for(i=0; i<nworkers; i++)
+	{
+		workers[i].config = config;
+		starpu_pthread_mutex_init(&workers[i].mutex,NULL);
+		starpu_pthread_mutex_destroy(&workers[i].mutex);
+
+		starpu_pthread_cond_init(&workers[i].started_cond,NULL);
+		starpu_pthread_cond_destroy(&workers[i].started_cond);
+
+		starpu_pthread_cond_init(&workers[i].ready_cond,NULL);
+		starpu_pthread_cond_destroy(&workers[i].ready_cond);
+
+		starpu_pthread_mutex_init(&workers[i].sched_mutex,NULL);
+		starpu_pthread_mutex_destroy(&workers[i].sched_mutex);
+
+		starpu_pthread_cond_init(&workers[i].sched_cond,NULL);
+		starpu_pthread_cond_destroy(&workers[i].sched_cond);
 
-    struct _starpu_mp_transfer_command_to_device *cmd = (struct _starpu_mp_transfer_command_to_device *)arg;
+		workers[i].current_task = NULL;
+		workers[i].set = NULL;
+		workers[i].terminated_jobs = NULL;
+	
+		//_starpu_barrier_counter_init(&workers[i].tasks_barrier, 1);
+		//_starpu_barrier_counter_destroy(&workers[i].tasks_barrier);
 
-    mp_node->dt_send_to_device(mp_node, cmd->devid, cmd->addr, cmd->size);
+		starpu_pthread_mutex_init(&workers[i].parallel_sect_mutex,NULL);
+		starpu_pthread_mutex_destroy(&workers[i].parallel_sect_mutex);
+
+		starpu_pthread_cond_init(&workers[i].parallel_sect_cond,NULL);
+		starpu_pthread_cond_destroy(&workers[i].parallel_sect_cond);
+
+	}
+
+	/* Retrieve combined workers */
+	struct _starpu_combined_worker * combined_workers = config->combined_workers; 
+	node->dt_recv(node, combined_workers, combined_worker_size);
+
+	node->baseworkerid = baseworkerid;
+	STARPU_PTHREAD_BARRIER_WAIT(&node->init_completed_barrier);	
 }
 
+
+
 /* Function looping on the sink, waiting for tasks to execute.
  * If the caller is the host, don't do anything.
  */
-
 void _starpu_sink_common_worker(void)
 {
 	struct _starpu_mp_node *node = NULL;
 	enum _starpu_mp_command command = STARPU_EXIT;
 	int arg_size = 0;
 	void *arg = NULL;
-
+	int exit_starpu = 0;
 	enum _starpu_mp_node_kind node_kind = _starpu_sink_common_get_kind();
 
 	if (node_kind == STARPU_INVALID_KIND)
@@ -234,46 +246,82 @@ void _starpu_sink_common_worker(void)
 	/* Create and initialize the node */
 	node = _starpu_mp_common_node_create(node_kind, -1);
 
-	while ((command = _starpu_mp_common_recv_command(node, &arg, &arg_size)) != STARPU_EXIT)
+	starpu_pthread_key_t worker_key;
+	STARPU_PTHREAD_KEY_CREATE(&worker_key, NULL);
+
+
+	struct _starpu_machine_config *config;
+	while (!exit_starpu)
 	{
-		switch(command)
+		/* If we have received a message */
+		if(node->mp_recv_is_ready(node))
+		{
+
+			command = _starpu_mp_common_recv_command(node, &arg, &arg_size);
+			switch(command)
+			{
+				case STARPU_EXIT:
+					exit_starpu = 1;
+					break;
+				case STARPU_EXECUTE:
+					config = _starpu_get_machine_config();
+					node->execute(node, arg, arg_size);
+					break;
+				case STARPU_SINK_NBCORES:
+					_starpu_sink_common_get_nb_cores(node);
+					break;
+				case STARPU_LOOKUP:
+					_starpu_sink_common_lookup(node, (char *) arg);
+					break;
+
+				case STARPU_ALLOCATE:
+					node->allocate(node, arg, arg_size);
+					break;
+
+				case STARPU_FREE:
+					node->free(node, arg, arg_size);
+					break;
+
+				case STARPU_RECV_FROM_HOST:
+					_starpu_sink_common_copy_from_host(node, arg, arg_size);
+					break;
+
+				case STARPU_SEND_TO_HOST:
+					_starpu_sink_common_copy_to_host(node, arg, arg_size);
+					break;
+
+				case STARPU_RECV_FROM_SINK:
+					_starpu_sink_common_copy_from_sink(node, arg, arg_size);
+					break;
+
+				case STARPU_SEND_TO_SINK:
+					_starpu_sink_common_copy_to_sink(node, arg, arg_size);
+					break;
+
+				case STARPU_SYNC_WORKERS:
+					_starpu_sink_common_recv_workers(node, arg, arg_size);
+					break;
+				default:
+					printf("Oops, command %x unrecognized\n", command);
+			}
+		}
+
+		STARPU_PTHREAD_MUTEX_LOCK(&node->message_queue_mutex);
+		/* If the list is not empty */
+		if(!mp_message_list_empty(node->message_queue))
 		{
-			case STARPU_EXECUTE:
-				node->execute(node, arg, arg_size);
-				break;
-			case STARPU_SINK_NBCORES:
-				node->nbcores (node);
-				break;
-			case STARPU_LOOKUP:
-				_starpu_sink_common_lookup(node, (char *) arg);
-				break;
-
-			case STARPU_ALLOCATE:
-				node->allocate(node, arg, arg_size);
-				break;
-
-			case STARPU_FREE:
-				node->free(node, arg, arg_size);
-				break;
-
-			case STARPU_RECV_FROM_HOST:
-				_starpu_sink_common_copy_from_host(node, arg, arg_size);
-				break;
-
-			case STARPU_SEND_TO_HOST:
-				_starpu_sink_common_copy_to_host(node, arg, arg_size);
-				break;
-
-			case STARPU_RECV_FROM_SINK:
-				_starpu_sink_common_copy_from_sink(node, arg, arg_size);
-				break;
-
-			case STARPU_SEND_TO_SINK:
-				_starpu_sink_common_copy_to_sink(node, arg, arg_size);
-				break;
-
-			default:
-				printf("Oops, command %x unrecognized\n", command);
+			/* We pop a message and send it to the host */
+			struct mp_message * message = mp_message_list_pop_back(node->message_queue);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
+			//_STARPU_DEBUG("telling host that we have finished the task %p sur %d.\n", task->kernel, task->coreid);
+			config = _starpu_get_machine_config();
+			_starpu_mp_common_send_command(node, message->type, 
+					&message->buffer, message->size);
+			mp_message_delete(message);
+		}
+		else
+		{
+			STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
 		}
 	}
 
@@ -282,3 +330,300 @@ void _starpu_sink_common_worker(void)
 
 	exit(0);
 }
+
+
+/* Search for the mp_barrier correspondind to the specified combined worker 
+ * and create it if it doesn't exist
+ */
+static struct mp_barrier * _starpu_sink_common_get_barrier(struct _starpu_mp_node * node, int cb_workerid, int cb_workersize)
+{
+	struct mp_barrier * b = NULL;
+	STARPU_PTHREAD_MUTEX_LOCK(&node->barrier_mutex);
+	/* Search if the barrier already exist */
+	for(b = mp_barrier_list_begin(node->barrier_list); 
+			b != mp_barrier_list_end(node->barrier_list) && b->id != cb_workerid; 
+			b = mp_barrier_list_next(b));
+
+	/* If we found the barrier */
+	if(b != NULL)
+	{
+		STARPU_PTHREAD_MUTEX_UNLOCK(&node->barrier_mutex);
+		return b;
+	}
+	else
+	{
+
+		/* Else we create, initialize and add it to the list*/
+		b = mp_barrier_new();
+		b->id = cb_workerid;
+		STARPU_PTHREAD_BARRIER_INIT(&b->before_work_barrier,NULL,cb_workersize);
+		STARPU_PTHREAD_BARRIER_INIT(&b->after_work_barrier,NULL,cb_workersize);
+		mp_barrier_list_push_back(node->barrier_list,b);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&node->barrier_mutex);
+		return b;
+	}
+}
+
+
+/* Erase for the mp_barrier correspondind to the specified combined worker
+*/
+static void _starpu_sink_common_erase_barrier(struct _starpu_mp_node * node, struct mp_barrier *barrier)
+{
+	STARPU_PTHREAD_MUTEX_LOCK(&node->barrier_mutex);
+	mp_barrier_list_erase(node->barrier_list,barrier);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&node->barrier_mutex);
+}
+
+/* Append the message given in parameter to the message list
+ */
+static void _starpu_sink_common_append_message(struct _starpu_mp_node *node, struct mp_message * message)
+{
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+	STARPU_PTHREAD_MUTEX_LOCK(&node->message_queue_mutex);
+	mp_message_list_push_front(node->message_queue,message);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
+
+}
+/* Append to the message list a "STARPU_PRE_EXECUTION" message
+ */
+static void _starpu_sink_common_pre_execution_message(struct _starpu_mp_node *node, struct mp_task *task)
+{
+	/* Init message to tell the sink that the execution has begun */
+	struct mp_message * message = mp_message_new();
+	message->type = STARPU_PRE_EXECUTION;
+	*(int *) message->buffer = task->combined_workerid;
+	message->size = sizeof(task->combined_workerid);
+
+
+	/* Append the message to the queue */	
+	_starpu_sink_common_append_message(node, message);
+
+}
+
+/* Append to the message list a "STARPU_EXECUTION_COMPLETED" message
+ */
+static void _starpu_sink_common_execution_completed_message(struct _starpu_mp_node *node, struct mp_task *task)
+{
+	/* Init message to tell the sink that the execution is completed */
+	struct mp_message * message = mp_message_new();
+	message->type = STARPU_EXECUTION_COMPLETED;
+	message->size = sizeof(task->coreid);
+	*(int*) message->buffer = task->coreid;
+
+	/* Append the message to the queue */
+	_starpu_sink_common_append_message(node, message);
+}
+
+
+/* Bind the thread which is running on the specified core to the combined worker */
+static void _starpu_sink_common_bind_to_combined_worker(struct _starpu_mp_node *node, int coreid, struct _starpu_combined_worker * combined_worker)
+{
+	int i;
+	int * bind_set = malloc(sizeof(int)*combined_worker->worker_size);
+	for(i=0;i<combined_worker->worker_size;i++)
+		bind_set[i] = combined_worker->combined_workerid[i] - node->baseworkerid;
+	node->bind_thread(node, coreid, bind_set, combined_worker->worker_size);
+}
+
+
+
+/* Get the current rank of the worker in the combined worker 
+ */
+static int _starpu_sink_common_get_current_rank(int workerid, struct _starpu_combined_worker * combined_worker)
+{
+	int i;
+	for(i=0; i<combined_worker->worker_size; i++)
+		if(workerid == combined_worker->combined_workerid[i])
+			return i;
+
+	STARPU_ASSERT(0);
+}
+
+/* Execute the task 
+ */
+static void _starpu_sink_common_execute_kernel(struct _starpu_mp_node *node, int coreid, struct _starpu_worker * worker)
+{
+	struct _starpu_combined_worker * combined_worker = NULL;
+	struct mp_task* task = node->run_table[coreid];
+
+
+	/* If it's a parallel task */
+	if(task->is_parallel_task)
+	{
+		combined_worker = _starpu_get_combined_worker_struct(task->combined_workerid);
+		
+		worker->current_rank = _starpu_sink_common_get_current_rank(worker->workerid, combined_worker);
+		worker->combined_workerid = task->combined_workerid;
+		worker->worker_size = combined_worker->worker_size;
+		
+		/* Synchronize with others threads of the combined worker*/
+		STARPU_PTHREAD_BARRIER_WAIT(&task->mp_barrier->before_work_barrier);
+
+
+		/* The first thread of the combined worker */
+		if(worker->current_rank == 0)
+		{
+			/* tell the sink that the execution has begun */
+			_starpu_sink_common_pre_execution_message(node,task);
+
+			/* If the mode is FORKJOIN, 
+			 * the first thread binds himself 
+			 * on all core of the combined worker*/
+			if(task->type == STARPU_FORKJOIN)
+			{
+				_starpu_sink_common_bind_to_combined_worker(node, coreid, combined_worker);
+			}
+		}
+	}
+	else
+	{
+		worker->current_rank = 0;
+		worker->combined_workerid = 0;
+		worker->worker_size = 1;
+	}
+	if(task->type != STARPU_FORKJOIN || worker->current_rank == 0)
+	{
+		/* execute the task */
+		task->kernel(task->interfaces,task->cl_arg);
+	}
+
+	/* If it's a parallel task */
+	if(task->is_parallel_task)
+	{
+		/* Synchronize with others threads of the combined worker*/
+		STARPU_PTHREAD_BARRIER_WAIT(&task->mp_barrier->after_work_barrier);
+
+		/* The fisrt thread of the combined */
+		if(worker->current_rank == 0)
+		{
+			/* Erase the barrier from the list */
+			_starpu_sink_common_erase_barrier(node,task->mp_barrier);
+
+			/* If the mode is FORKJOIN, 
+			 * the first thread rebinds himself on his own core */
+			if(task->type == STARPU_FORKJOIN)
+				node->bind_thread(node, coreid, &coreid, 1);
+
+		}
+	}
+
+	node->run_table[coreid] = NULL;
+
+	/* tell the sink that the execution is completed */
+	_starpu_sink_common_execution_completed_message(node,task);
+
+	/*free the task*/
+	unsigned i;
+	for (i = 0; i < task->nb_interfaces; i++)
+		free(task->interfaces[i]);
+	free(task);
+
+}
+
+
+/* The main function executed by the thread 
+ * thread_arg is a structure containing the information needed by the thread
+ */
+void* _starpu_sink_thread(void * thread_arg)
+{
+	/* Retrieve the information from the structure */
+	struct _starpu_mp_node *node = ((struct arg_sink_thread *)thread_arg)->node;
+	int coreid =((struct arg_sink_thread *)thread_arg)->coreid;
+	/* free the structure */
+	free(thread_arg);
+
+	STARPU_PTHREAD_BARRIER_WAIT(&node->init_completed_barrier);	
+
+	struct _starpu_worker *worker = &_starpu_get_machine_config()->workers[node->baseworkerid + coreid];
+
+	_starpu_set_local_worker_key(worker);
+	while(node->is_running)
+	{
+		/*Wait there is a task available */
+		sem_wait(&node->sem_run_table[coreid]);
+		if(node->run_table[coreid] != NULL)
+			_starpu_sink_common_execute_kernel(node,coreid,worker);
+
+	}
+	pthread_exit(NULL);
+}
+
+
+/* Add the task to the specific thread and wake him up
+*/
+static void _starpu_sink_common_execute_thread(struct _starpu_mp_node *node, struct mp_task *task)
+{
+	/* Add the task to the specific thread */
+	node->run_table[task->coreid] = task;
+	/* Unlock the mutex to wake up the thread which will execute the task */
+	sem_post(&node->sem_run_table[task->coreid]);
+}
+
+
+
+/* Receive paquet from _starpu_src_common_execute_kernel in the form below :
+ * [Function pointer on sink, number of interfaces, interfaces
+ * (union _starpu_interface), cl_arg]
+ * Then call the function given, passing as argument an array containing the
+ * addresses of the received interfaces
+ */
+
+void _starpu_sink_common_execute(struct _starpu_mp_node *node,
+		void *arg, int arg_size)
+{
+	unsigned i;
+
+	void *arg_ptr = arg;
+	struct mp_task *task = malloc(sizeof(struct mp_task));
+
+	task->kernel = *(void(**)(void **, void *)) arg_ptr;
+	arg_ptr += sizeof(task->kernel);
+
+	task->type = *(enum starpu_codelet_type *) arg_ptr;
+	arg_ptr += sizeof(task->type);
+
+	task->is_parallel_task = *(int *) arg_ptr;
+	arg_ptr += sizeof(task->is_parallel_task);
+
+	if(task->is_parallel_task)
+	{
+		task->combined_workerid= *(int *) arg_ptr;
+		arg_ptr += sizeof(task->combined_workerid);
+
+		task->mp_barrier = _starpu_sink_common_get_barrier(node,task->combined_workerid,_starpu_get_combined_worker_struct(task->combined_workerid)->worker_size);
+	}
+
+	task->coreid = *(unsigned *) arg_ptr;
+	arg_ptr += sizeof(task->coreid);
+
+	task->nb_interfaces = *(unsigned *) arg_ptr;
+	arg_ptr += sizeof(task->nb_interfaces);
+
+	/* The function needs an array pointing to each interface it needs
+	 * during execution. As in sink-side there is no mean to know which
+	 * kind of interface to expect, the array is composed of unions of
+	 * interfaces, thus we expect the same size anyway */
+	for (i = 0; i < task->nb_interfaces; i++)
+	{
+		union _starpu_interface * interface = malloc(sizeof(union _starpu_interface));   
+		memcpy(interface, arg_ptr, 
+				sizeof(union _starpu_interface));
+		task->interfaces[i] = interface;
+		arg_ptr += sizeof(union _starpu_interface);
+	}
+
+	/* Was cl_arg sent ? */
+	if (arg_size > arg_ptr - arg)
+		task->cl_arg = arg_ptr;
+	else
+		task->cl_arg = NULL;
+
+
+	//_STARPU_DEBUG("telling host that we have submitted the task %p.\n", task->kernel);
+	_starpu_mp_common_send_command(node, STARPU_EXECUTION_SUBMITTED,
+			NULL, 0);
+
+	//_STARPU_DEBUG("executing the task %p\n", task->kernel);
+	_starpu_sink_common_execute_thread(node, task);	
+
+}

+ 9 - 2
src/drivers/mp_common/sink_common.h

@@ -32,14 +32,21 @@ struct _starpu_sink_topology
 	unsigned nb_cpus;
 };
 
+struct arg_sink_thread
+{
+	struct _starpu_mp_node *node;
+	int coreid;
+};
+
 void _starpu_sink_common_worker(void);
 
-void _starpu_sink_common_execute(const struct _starpu_mp_node *node, void *arg, int arg_size);
-void _starpu_sink_nbcores (const struct _starpu_mp_node *node);
+void _starpu_sink_common_execute(struct _starpu_mp_node *node, void *arg, int arg_size);
 
 void _starpu_sink_common_allocate(const struct _starpu_mp_node *mp_node, void *arg, int arg_size);
 void _starpu_sink_common_free(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, void *arg, int arg_size);
 
+void* _starpu_sink_thread(void * thread_arg);
+
 #endif /* STARPU_USE_MP */
 
 

+ 432 - 64
src/drivers/mp_common/source_common.c

@@ -19,28 +19,241 @@
 #include <pthread.h>
 
 #include <starpu.h>
+#include <core/task.h>
+#include <core/sched_policy.h>
+
+#include <drivers/driver_common/driver_common.h>
+
+
 #include <datawizard/coherency.h>
 #include <datawizard/interfaces/data_interface.h>
 #include <drivers/mp_common/mp_common.h>
 
-int
-_starpu_src_common_sink_nbcores (const struct _starpu_mp_node *node, int *buf)
+
+/* Finalize the execution of a task by a worker*/
+static int _starpu_src_common_finalize_job (struct _starpu_job *j, struct _starpu_worker *worker) 
+{
+	uint32_t mask = 0;
+	int profiling = starpu_profiling_status_get();
+	struct timespec codelet_end;
+	_starpu_driver_end_job(worker, j, worker->perf_arch, &codelet_end, 0,
+			profiling);
+	
+	int count = worker->current_rank;
+
+	/* If it's a combined worker, we check if it's the last one of his combined */
+	if(j->task_size > 1)
+	{
+		struct _starpu_combined_worker * cb_worker = _starpu_get_combined_worker_struct(worker->combined_workerid); 
+
+		pthread_mutex_lock(&cb_worker->count_mutex);
+		count = cb_worker->count--;
+		if(count == 0)
+			cb_worker->count = cb_worker->worker_size - 1; 
+		pthread_mutex_unlock(&cb_worker->count_mutex);
+	}
+
+	/* Finalize the execution */
+	if(count == 0)
+	{
+
+		_starpu_driver_update_job_feedback(j, worker, worker->perf_arch,
+				&j->cl_start, &codelet_end,
+				profiling);
+
+		_starpu_push_task_output (j, mask);
+
+		_starpu_handle_job_termination(j);
+	}
+	return 0;
+}
+
+
+/* Complete the execution of the job */
+static int _starpu_src_common_process_completed_job(struct _starpu_worker_set *workerset, void * arg, int arg_size)
+{
+	int coreid;
+
+	STARPU_ASSERT(sizeof(coreid) == arg_size);	
+	
+	coreid = *(int *) arg;
+
+	struct _starpu_worker *worker = &workerset->workers[coreid];
+	struct _starpu_job *j = _starpu_get_job_associated_to_task(worker->current_task);
+
+	struct _starpu_worker * old_worker = _starpu_get_local_worker_key();
+
+	_starpu_set_local_worker_key(worker);
+	_starpu_src_common_finalize_job (j, worker);
+	_starpu_set_local_worker_key(old_worker);
+
+	worker->current_task = NULL;
+	return 0;
+}
+
+/* Tell the scheduler when the execution has begun */
+static void _starpu_src_common_pre_exec(void * arg, int arg_size)
+{
+	int cb_workerid, i;
+	STARPU_ASSERT(sizeof(cb_workerid) == arg_size);
+	cb_workerid = *(int *) arg;
+	struct _starpu_combined_worker *combined_worker = _starpu_get_combined_worker_struct(cb_workerid);
+	for(i=0; i < combined_worker->worker_size; i++)
+	{
+		struct _starpu_worker * worker = _starpu_get_worker_struct(combined_worker->combined_workerid[i]);
+		_starpu_set_local_worker_key(worker);
+		_starpu_sched_pre_exec_hook(worker->current_task);
+	}	
+}
+
+/* recv a message and handle asynchronous message
+ * return 0 if the message has not been handle (it's certainly mean that it's a synchronous message)
+ * return 1 if the message has been handle
+ */
+static int _starpu_src_common_handle_async(const struct _starpu_mp_node *node STARPU_ATTRIBUTE_UNUSED, 
+		void * arg, int arg_size, 
+		enum _starpu_mp_command answer)
+{
+	struct _starpu_worker_set * worker_set=NULL; 
+	switch(answer) 
+	{
+		case STARPU_EXECUTION_COMPLETED:
+			worker_set = _starpu_get_worker_struct(starpu_worker_get_id())->set;
+			_starpu_src_common_process_completed_job(worker_set, arg, arg_size);
+			break;
+		case STARPU_PRE_EXECUTION:
+			_starpu_src_common_pre_exec(arg,arg_size);
+			break;
+		default:
+			return 0;
+			break;
+	}
+	return 1;
+}
+
+/* Handle all message which have been stored in the message_queue */
+static void _starpu_src_common_handle_stored_async(struct _starpu_mp_node *node)
+{
+	STARPU_PTHREAD_MUTEX_LOCK(&node->message_queue_mutex);
+	/* while the list is not empty */
+	while(!mp_message_list_empty(node->message_queue))
+	{
+		/* We pop a message and handle it */
+		struct mp_message * message = mp_message_list_pop_back(node->message_queue);
+		_starpu_src_common_handle_async(node, message->buffer, 
+				message->size, message->type);
+		mp_message_delete(message);
+	}
+	STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
+}
+
+/* Store a message if is asynchronous 
+ * return 1 if the message has been stored
+ * return 0 if the message is unknown or synchrone */
+int _starpu_src_common_store_message(struct _starpu_mp_node *node, 
+		void * arg, int arg_size, enum _starpu_mp_command answer)
+{
+	struct mp_message * message = NULL;
+	switch(answer)
+	{
+		case STARPU_EXECUTION_COMPLETED:
+		case STARPU_PRE_EXECUTION:
+			message = mp_message_new();
+			message->type = answer;
+			memcpy(message->buffer, arg, arg_size); 
+			message->size = arg_size; 
+
+			STARPU_PTHREAD_MUTEX_LOCK(&node->message_queue_mutex);
+			mp_message_list_push_front(node->message_queue,message);
+			STARPU_PTHREAD_MUTEX_UNLOCK(&node->message_queue_mutex);
+			return 1;
+			break;
+		default:
+			return 0;
+			break;
+	}
+}
+
+/* Store all asynchronous messages and return when a synchronous message is received */
+static enum _starpu_mp_command _starpu_src_common_wait_command_sync(struct _starpu_mp_node *node, 
+		void ** arg, int* arg_size)
+{
+	enum _starpu_mp_command answer;
+	int is_sync = 0;
+	while(!is_sync)
+	{
+		answer = _starpu_mp_common_recv_command(node, arg, arg_size);
+		if(!_starpu_src_common_store_message(node,*arg,*arg_size,answer))
+			is_sync=1;
+	}
+	return answer;
+}
+
+/* Handle a asynchrone message and return a error if a synchronous message is received */
+static void _starpu_src_common_recv_async(struct _starpu_mp_node * node)
+{
+	enum _starpu_mp_command answer;
+	void *arg;
+	int arg_size;
+	answer = _starpu_mp_common_recv_command(node, &arg, &arg_size);
+	if(!_starpu_src_common_handle_async(node,arg,arg_size,answer))
+	{
+		printf("incorrect commande: unknown command or sync command");
+		STARPU_ASSERT(0);
+	}	
+}
+
+/* Handle all asynchrone message while a completed execution message from a specific worker has been receive */
+ enum _starpu_mp_command _starpu_src_common_wait_completed_execution(struct _starpu_mp_node *node, int devid, void **arg, int * arg_size)
+{
+	enum _starpu_mp_command answer;
+
+	int completed = 0;	
+	/*While the waited completed execution message has not been receive*/
+	while(!completed)
+	{
+		answer = _starpu_mp_common_recv_command (node, arg, arg_size);
+
+		if(answer == STARPU_EXECUTION_COMPLETED)
+		{
+			int coreid;
+			STARPU_ASSERT(sizeof(coreid) == *arg_size);	
+			coreid = *(int *) *arg;
+			if(devid == coreid)
+				completed = 1;
+			else
+				if(!_starpu_src_common_store_message(node, *arg, *arg_size, answer))
+					/* We receive a unknown or asynchronous message  */
+					STARPU_ASSERT(0);
+		}
+		else
+		{
+			if(!_starpu_src_common_store_message(node, *arg, *arg_size, answer))
+				/* We receive a unknown or asynchronous message  */
+				STARPU_ASSERT(0);
+		}
+	}
+	return answer;
+}
+
+
+/* Send a request to the sink NODE for the number of cores on it. */
+int _starpu_src_common_sink_nbcores (const struct _starpu_mp_node *node, int *buf)
 {
-    // Send a request to the sink NODE for the number of cores on it.
 
-    enum _starpu_mp_command answer;
-    void *arg;
-    int arg_size = sizeof (int);
+	enum _starpu_mp_command answer;
+	void *arg;
+	int arg_size = sizeof (int);
 
-    _starpu_mp_common_send_command (node, STARPU_SINK_NBCORES, NULL, 0);
+	_starpu_mp_common_send_command (node, STARPU_SINK_NBCORES, NULL, 0);
 
-    answer = _starpu_mp_common_recv_command (node, &arg, &arg_size);
+	answer = _starpu_mp_common_recv_command (node, &arg, &arg_size);
 
-    STARPU_ASSERT (answer == STARPU_ANSWER_SINK_NBCORES && arg_size == sizeof (int));
+	STARPU_ASSERT (answer == STARPU_ANSWER_SINK_NBCORES && arg_size == sizeof (int));
 
-    memcpy (buf, arg, arg_size);
+	memcpy (buf, arg, arg_size);
 
-    return 0;
+	return 0;
 }
 
 /* Send a request to the sink linked to NODE for the pointer to the
@@ -49,7 +262,7 @@ _starpu_src_common_sink_nbcores (const struct _starpu_mp_node *node, int *buf)
  * else it returns -ESPIPE if the function was not found.
  */
 int _starpu_src_common_lookup(struct _starpu_mp_node *node,
-			      void (**func_ptr)(void), const char *func_name)
+		void (**func_ptr)(void), const char *func_name)
 {
 	enum _starpu_mp_command answer;
 	void *arg;
@@ -60,19 +273,21 @@ int _starpu_src_common_lookup(struct _starpu_mp_node *node,
 
 	//_STARPU_DEBUG("Looking up %s\n", func_name);
 	_starpu_mp_common_send_command(node, STARPU_LOOKUP, (void *) func_name,
-				       arg_size);
-	answer = _starpu_mp_common_recv_command(node, (void **) &arg,
-						&arg_size);
+			arg_size);
+
+	answer = _starpu_src_common_wait_command_sync(node, (void **) &arg,
+			&arg_size);
 
-	if (answer == STARPU_ERROR_LOOKUP) {
+	if (answer == STARPU_ERROR_LOOKUP) 
+	{
 		_STARPU_DISP("Error looking up symbol %s\n", func_name);
 		return -ESPIPE;
 	}
 
 	/* We have to be sure the device answered the right question and the
 	 * answer has the right size */
-	STARPU_ASSERT(answer == STARPU_ANSWER_LOOKUP &&
-		      arg_size == sizeof(*func_ptr));
+	STARPU_ASSERT(answer == STARPU_ANSWER_LOOKUP);
+	STARPU_ASSERT(arg_size == sizeof(*func_ptr));
 
 	memcpy(func_ptr, arg, arg_size);
 
@@ -81,32 +296,46 @@ int _starpu_src_common_lookup(struct _starpu_mp_node *node,
 	return 0;
 }
 
- /* Send a message to the sink to execute a kernel.
+/* Send a message to the sink to execute a kernel.
  * The message sent has the form below :
  * [Function pointer on sink, number of interfaces, interfaces
  * (union _starpu_interface), cl_arg]
  */
-int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
-				      void (*kernel)(void), unsigned coreid,
-				      starpu_data_handle_t *handles,
-				      void **interfaces,
-				      unsigned nb_interfaces,
-				      void *cl_arg, size_t cl_arg_size)
+/* Launch the execution of the function KERNEL points to on the sink linked
+ * to NODE. Returns 0 in case of success, -EINVAL if kernel is an invalid
+ * pointer.
+ * Data interfaces in task are send to the sink.
+ */
+int _starpu_src_common_execute_kernel(struct _starpu_mp_node *node,
+		void (*kernel)(void), unsigned coreid,
+		enum starpu_codelet_type type,
+		int is_parallel_task, int cb_workerid,
+		starpu_data_handle_t *handles,
+		void **interfaces,
+		unsigned nb_interfaces,
+		void *cl_arg, size_t cl_arg_size)
 {
-	unsigned id;
-	void *buffer, *buffer_ptr, *arg = NULL;
-	int buffer_size = 0, arg_size = 0;
+
+	void *buffer, *buffer_ptr, *arg =NULL;
+	int buffer_size = 0, arg_size =0;
+	unsigned i;
+
+	buffer_size = sizeof(kernel) + sizeof(coreid) + sizeof(type)
+		+ sizeof(nb_interfaces) + nb_interfaces * sizeof(union _starpu_interface) + sizeof(is_parallel_task);
+	
+	/*if the task is paralle*/
+	if(is_parallel_task)
+	{
+		buffer_size += sizeof(cb_workerid); 
+	}
 
 	/* If the user didn't give any cl_arg, there is no need to send it */
-	buffer_size =
-	    sizeof(kernel) + sizeof(coreid) + sizeof(nb_interfaces) +
-	    nb_interfaces * sizeof(union _starpu_interface);
 	if (cl_arg)
 	{
 		STARPU_ASSERT(cl_arg_size);
 		buffer_size += cl_arg_size;
 	}
-
+	
 	/* We give to send_command a buffer we just allocated, which contains
 	 * a pointer to the function (sink-side), core on which execute this
 	 * function (sink-side), number of interfaces we send,
@@ -116,6 +345,18 @@ int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
 	*(void(**)(void)) buffer = kernel;
 	buffer_ptr += sizeof(kernel);
 
+	*(enum starpu_codelet_type *) buffer_ptr = type;
+	buffer_ptr += sizeof(type);
+
+	*(int *) buffer_ptr = is_parallel_task;
+	buffer_ptr += sizeof(is_parallel_task);
+
+	if(is_parallel_task)
+	{
+		*(int *) buffer_ptr = cb_workerid ;
+		buffer_ptr += sizeof(cb_workerid);
+	}
+
 	*(unsigned *) buffer_ptr = coreid;
 	buffer_ptr += sizeof(coreid);
 
@@ -126,11 +367,13 @@ int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
 	 * executed on a sink with a different memory, whereas a codelet is
 	 * executed on the host part for the other accelerators.
 	 * Thus we need to send a copy of each interface on the MP device */
-	for (id = 0; id < nb_interfaces; id++)
+
+	for (i = 0; i < nb_interfaces; i++)
 	{
-		starpu_data_handle_t handle = handles[id];
-		memcpy (buffer_ptr, interfaces[id],
-			handle->ops->interface_size);
+		starpu_data_handle_t handle = handles[i];
+
+		memcpy (buffer_ptr, interfaces[i],
+				handle->ops->interface_size);
 		/* The sink side has no mean to get the type of each
 		 * interface, we use a union to make it generic and permit the
 		 * sink to go through the array */
@@ -141,7 +384,7 @@ int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
 		memcpy(buffer_ptr, cl_arg, cl_arg_size);
 
 	_starpu_mp_common_send_command(node, STARPU_EXECUTE, buffer, buffer_size);
-	enum _starpu_mp_command answer = _starpu_mp_common_recv_command(node, &arg, &arg_size);
+	enum _starpu_mp_command answer = _starpu_src_common_wait_command_sync(node, &arg, &arg_size);
 
 	if (answer == STARPU_ERROR_EXECUTE)
 		return -EINVAL;
@@ -151,46 +394,76 @@ int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
 	free(buffer);
 
 	return 0;
-
 }
 
-/* Launch the execution of the function KERNEL points to on the sink linked
- * to NODE. Returns 0 in case of success, -EINVAL if kernel is an invalid
- * pointer.
- * Data interfaces in task are send to the sink.
- */
-int _starpu_src_common_execute_kernel_from_task(const struct _starpu_mp_node *node,
-						void (*kernel)(void), unsigned coreid,
-						struct starpu_task *task)
+
+/* Get the information and call the function to send to the sink a message to execute the task*/
+static int _starpu_src_common_execute(struct _starpu_job *j, 
+		struct _starpu_worker *worker, 
+		struct _starpu_mp_node * node)
 {
-    return _starpu_src_common_execute_kernel(node, kernel, coreid,
-					     task->handles, task->interfaces, task->cl->nbuffers,
-					     task->cl_arg, task->cl_arg_size);
+	int ret;
+	uint32_t mask = 0;
+
+	STARPU_ASSERT(j);
+	struct starpu_task *task = j->task;
+
+	int profiling = starpu_profiling_status_get();
+
+	STARPU_ASSERT(task);
+	if (worker->current_rank == 0) 
+	{
+		ret = _starpu_fetch_task_input(j, mask);
+		if (ret != 0)
+		{
+			/* there was not enough memory, so the input of
+			 * the codelet cannot be fetched ... put the
+			 * codelet back, and try it later */
+			return -EAGAIN;
+		}
+	}
+
+	void (*kernel)(void)  = node->get_kernel_from_job(node,j);
+
+	_starpu_driver_start_job(worker, j, &j->cl_start, 0, profiling);
+
+
+	//_STARPU_DEBUG("\nworkerid:%d, rank:%d, type:%d,	cb_workerid:%d, task_size:%d\n\n",worker->devid,worker->current_rank,task->cl->type,j->combined_workerid,j->task_size);
+
+	_starpu_src_common_execute_kernel(node, kernel, worker->devid, task->cl->type,
+			(j->task_size > 1),
+			j->combined_workerid, task->handles,
+			task->interfaces, task->cl->nbuffers,
+			task->cl_arg, task->cl_arg_size);
+
+
+	return 0;
 }
 
+
 /* Send a request to the sink linked to the MP_NODE to allocate SIZE bytes on
  * the sink.
  * In case of success, it returns 0 and *ADDR contains the address of the
  * allocated area ;
  * else it returns 1 if the allocation fail.
  */
-int _starpu_src_common_allocate(const struct _starpu_mp_node *mp_node,
-								void **addr, size_t size)
+int _starpu_src_common_allocate(struct _starpu_mp_node *mp_node,
+		void **addr, size_t size)
 {
 	enum _starpu_mp_command answer;
 	void *arg;
 	int arg_size;
 
 	_starpu_mp_common_send_command(mp_node, STARPU_ALLOCATE, &size,
-								   sizeof(size));
+			sizeof(size));
 
-	answer = _starpu_mp_common_recv_command(mp_node, &arg, &arg_size);
+	answer = _starpu_src_common_wait_command_sync(mp_node, &arg, &arg_size);
 
 	if (answer == STARPU_ERROR_ALLOCATE)
 		return 1;
 
 	STARPU_ASSERT(answer == STARPU_ANSWER_ALLOCATE &&
-				  arg_size == sizeof(*addr));
+			arg_size == sizeof(*addr));
 
 	memcpy(addr, arg, arg_size);
 
@@ -201,15 +474,15 @@ int _starpu_src_common_allocate(const struct _starpu_mp_node *mp_node,
  * area pointed by ADDR.
  */
 void _starpu_src_common_free(const struct _starpu_mp_node *mp_node,
-							 void *addr)
+		void *addr)
 {
 	_starpu_mp_common_send_command(mp_node, STARPU_FREE, &addr, sizeof(addr));
 }
 
 /* Send SIZE bytes pointed by SRC to DST on the sink linked to the MP_NODE.
- */
+*/
 int _starpu_src_common_copy_host_to_sink(const struct _starpu_mp_node *mp_node,
-										 void *src, void *dst, size_t size)
+		void *src, void *dst, size_t size)
 {
 	struct _starpu_mp_transfer_command cmd = {size, dst};
 
@@ -220,9 +493,9 @@ int _starpu_src_common_copy_host_to_sink(const struct _starpu_mp_node *mp_node,
 }
 
 /* Receive SIZE bytes pointed by SRC on the sink linked to the MP_NODE and store them in DST.
- */
+*/
 int _starpu_src_common_copy_sink_to_host(const struct _starpu_mp_node *mp_node,
-										 void *src, void *dst, size_t size)
+		void *src, void *dst, size_t size)
 {
 	struct _starpu_mp_transfer_command cmd = {size, src};
 
@@ -265,8 +538,8 @@ int _starpu_src_common_copy_sink_to_sink(const struct _starpu_mp_node *src_node,
 /* 5 functions to determine the executable to run on the device (MIC, SCC,
  * MPI).
  */
-static void _starpu_src_common_cat_3(char *final, const char *first, const char *second,
-										  const char *third)
+static void _starpu_src_common_cat_3(char *final, const char *first, 
+		const char *second, const char *third)
 {
 	strcpy(final, first);
 	strcat(final, second);
@@ -304,9 +577,9 @@ static int _starpu_src_common_test_suffixes(char *located_file_name, const char
 }
 
 int _starpu_src_common_locate_file(char *located_file_name,
-							const char *env_file_name, const char *env_mic_path,
-							const char *config_file_name, const char *actual_file_name,
-							const char **suffixes)
+		const char *env_file_name, const char *env_mic_path,
+		const char *config_file_name, const char *actual_file_name,
+		const char **suffixes)
 {
 	if (env_file_name != NULL)
 	{
@@ -372,3 +645,98 @@ int _starpu_src_common_locate_file(char *located_file_name,
 
 	return 1;
 }
+
+/* Send workers to the sink node 
+ */
+static void _starpu_src_common_send_workers(struct _starpu_mp_node * node, int baseworkerid, int nworkers)
+{	
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+	int worker_size = sizeof(struct _starpu_worker)*nworkers;	
+	int combined_worker_size = STARPU_NMAX_COMBINEDWORKERS*sizeof(struct _starpu_combined_worker);
+	int msg[5];
+	msg[0] = nworkers;
+	msg[1] = worker_size;
+	msg[2] = combined_worker_size;
+	msg[3] = baseworkerid;
+	msg[4] = starpu_worker_get_count();
+
+	/* tell the sink node that we will send him all workers */
+	_starpu_mp_common_send_command(node, STARPU_SYNC_WORKERS, 
+			&msg, sizeof(msg));
+
+	/* Send all worker to the sink node */
+	node->dt_send(node,&config->workers[baseworkerid],worker_size);
+
+	/* Send all combined workers to the sink node */
+	node->dt_send(node, &config->combined_workers,combined_worker_size);
+}	
+
+/* Function looping on the source node */
+void _starpu_src_common_worker(struct _starpu_worker_set * worker_set, 
+		unsigned baseworkerid, 
+		struct _starpu_mp_node * mp_node)
+{ 
+	unsigned memnode = worker_set->workers[0].memory_node;
+	struct starpu_task **tasks = malloc(sizeof(struct starpu_task *)*worker_set->nworkers);
+
+	_starpu_src_common_send_workers(mp_node, baseworkerid, worker_set->nworkers);
+
+	/*main loop*/
+	while (_starpu_machine_is_running())
+	{
+		int res;
+		struct _starpu_job * j;
+
+		_STARPU_TRACE_START_PROGRESS(memnode);
+		_starpu_datawizard_progress(memnode, 1);
+		_STARPU_TRACE_END_PROGRESS(memnode);
+
+		/* Handle message which have been store */
+		_starpu_src_common_handle_stored_async(mp_node);
+
+		/* poll the device for completed jobs.*/
+		while(mp_node->mp_recv_is_ready(mp_node))
+			_starpu_src_common_recv_async(mp_node);
+
+		/* get task for each worker*/
+		res = _starpu_get_multi_worker_task(worker_set->workers, tasks, worker_set->nworkers);
+
+		/*if at least one worker have pop a task*/
+		if(res != 0)
+		{
+			unsigned i;
+			for(i=0; i<worker_set->nworkers; i++)
+			{
+				if(tasks[i] != NULL)
+				{
+					j = _starpu_get_job_associated_to_task(tasks[i]);
+					_starpu_set_local_worker_key(&worker_set->workers[i]);
+					res =  _starpu_src_common_execute(j, &worker_set->workers[i], mp_node);
+					switch (res)
+					{
+						case 0:
+							/* The task task has been launched with no error */
+							break;
+						case -EAGAIN:
+							_STARPU_DISP("ouch, Xeon Phi could not actually run task %p, putting it back...\n", tasks[i]);
+							_starpu_push_task_to_workers(tasks[i]);
+							STARPU_ABORT();
+							continue;
+							break;
+						default:
+							STARPU_ASSERT(0);
+					}
+				}
+			}
+		}
+	}
+	free(tasks);
+
+	_starpu_handle_all_pending_node_data_requests(memnode);
+
+	/* In case there remains some memory that was automatically
+	 * allocated by StarPU, we release it now. Note that data
+	 * coherency is not maintained anymore at that point ! */
+	_starpu_free_all_automatically_allocated_buffers(memnode);
+
+}

+ 28 - 9
src/drivers/mp_common/source_common.h

@@ -21,28 +21,42 @@
 
 #ifdef STARPU_USE_MP
 
+#include <core/sched_policy.h>
+#include <core/task.h>
 #include <drivers/mp_common/mp_common.h>
 
+
+enum _starpu_mp_command _starpu_src_common_wait_command_sync(struct _starpu_mp_node *node, 
+							     void ** arg, int* arg_size);
+void _starpu_src_common_recv_async(struct _starpu_worker_set *worker_set, 
+				   struct _starpu_mp_node * baseworker_node);
+
+int _starpu_src_common_store_message(struct _starpu_mp_node *node, 
+		void * arg, int arg_size, enum _starpu_mp_command answer);
+
+enum _starpu_mp_command _starpu_src_common_wait_completed_execution(struct _starpu_mp_node *node, int devid, void **arg, int * arg_size);
+
 int _starpu_src_common_sink_nbcores (const struct _starpu_mp_node *node, int *buf);
 
 int _starpu_src_common_lookup(const struct _starpu_mp_node *node,
 			      void (**func_ptr)(void), const char *func_name);
 
-int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
-				      void (*kernel)(void), unsigned coreid,
-				      starpu_data_handle_t *handles, void **interfaces, unsigned nb_interfaces,
-				      void *cl_arg, size_t cl_arg_size);
-
-int _starpu_src_common_execute_kernel_from_task(const struct _starpu_mp_node *node,
-						void (*kernel)(void), unsigned coreid,
-						struct starpu_task *task);
-
 int _starpu_src_common_allocate(const struct _starpu_mp_node *mp_node,
 				void **addr, size_t size);
 
 void _starpu_src_common_free(const struct _starpu_mp_node *mp_node,
 			     void *addr);
 
+int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
+				      void (*kernel)(void), unsigned coreid,
+				      enum starpu_codelet_type type,
+				      int is_parallel_task, int cb_workerid,
+				      starpu_data_handle_t *handles,
+				      void **interfaces,
+				      unsigned nb_interfaces,
+				      void *cl_arg, size_t cl_arg_size);
+
+
 int _starpu_src_common_copy_host_to_sink(const struct _starpu_mp_node *mp_node,
 					 void *src, void *dst, size_t size);
 
@@ -57,6 +71,11 @@ int _starpu_src_common_locate_file(char *located_file_name,
 				   const char *config_file_name, const char *actual_file_name,
 				   const char **suffixes);
 
+void _starpu_src_common_worker(struct _starpu_worker_set * worker_set, 
+			       unsigned baseworkerid, 
+			       struct _starpu_mp_node * node_set);
+
+
 #endif /* STARPU_USE_MP */
 
 

+ 8 - 0
src/drivers/scc/driver_scc_common.c

@@ -172,3 +172,11 @@ void _starpu_scc_common_report_rcce_error(const char *func, const char *file, co
 	fprintf(stderr, "RCCE error in %s (%s:%d): %s\n", func, file, line, error_string); 
 	STARPU_ABORT();
 }
+
+int _starpu_scc_common_recv_is_ready(const struct _starpu_mp_node *mp_node)
+{
+  /***********
+      TODO
+  ************/
+  STARPU_ASSERT(0);
+}

+ 2 - 0
src/drivers/scc/driver_scc_common.h

@@ -44,6 +44,8 @@ void _starpu_scc_common_recv(const struct _starpu_mp_node *node, void *msg, int
 
 void _starpu_scc_common_report_rcce_error(const char *func, const char *file, const int line, const int err_no);
 
+int _starpu_scc_common_recv_is_ready(const struct _starpu_mp_node *mp_node);
+
 #endif /* STARPU_USE_SCC */
 
 

+ 34 - 0
src/drivers/scc/driver_scc_sink.c

@@ -16,6 +16,7 @@
 
 
 #include <RCCE.h>
+#include <dlfcn.h>
 
 #include <datawizard/interfaces/data_interface.h>
 #include <drivers/mp_common/sink_common.h>
@@ -27,6 +28,23 @@
 void _starpu_scc_sink_init(struct _starpu_mp_node *node)
 {
 	node->mp_connection.scc_nodeid = _starpu_scc_common_get_src_node_id();
+
+	/****************
+	 *     TODO     *
+	 * get nb_cores *
+	 ****************/
+	node->nb_cores = 1; 
+	STARPU_ASSERT(0);
+
+}
+
+void _starpu_scc_sink_launch_workers(struct _starpu_mp_node *node)
+{
+	/*****************
+	 *     TODO      *
+	 * init thread   *
+	 *****************/
+	STARPU_ASSERT(0);
 }
 
 void _starpu_scc_sink_deinit(struct _starpu_mp_node *node)
@@ -51,6 +69,15 @@ void _starpu_scc_sink_recv_from_device(const struct _starpu_mp_node *node, int s
 		STARPU_MP_COMMON_REPORT_ERROR(node, ret);
 }
 
+void _starpu_scc_sink_bind_thread(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, cpu_set_t * cpuset, int coreid, pthread_t *thread)
+{
+	/****************
+	 *     TODO     *
+	 ****************/
+	STARPU_ASSERT(0);
+}
+
+
 /* arg -> [Function pointer on sink, number of interfaces, interfaces
  * (union _starpu_interface), cl_arg]
  *
@@ -124,3 +151,10 @@ void _starpu_scc_sink_execute(const struct _starpu_mp_node *node, void *arg, int
 
 	_starpu_sink_common_execute(node, arg, arg_size);
 }
+
+void (*_starpu_scc_sink_lookup (const struct _starpu_mp_node * node STARPU_ATTRIBUTE_UNUSED, char* func_name))(void)
+{
+	void *dl_handle = dlopen(NULL, RTLD_NOW);
+	return dlsym(dl_handle, func_name);
+}
+

+ 5 - 0
src/drivers/scc/driver_scc_sink.h

@@ -25,13 +25,18 @@
 #include <drivers/mp_common/mp_common.h>
 
 void _starpu_scc_sink_init(struct _starpu_mp_node *node);
+void _starpu_scc_sink_launch_workers(struct _starpu_mp_node *node);
 void _starpu_scc_sink_deinit(struct _starpu_mp_node *node);
 
 void _starpu_scc_sink_send_to_device(const struct _starpu_mp_node *node, int dst_devid, void *msg, int len);
 void _starpu_scc_sink_recv_from_device(const struct _starpu_mp_node *node, int src_devid, void *msg, int len);
 
+void _starpu_scc_sink_bind_thread(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, cpu_set_t * cpuset, int coreid, pthread_t *thread);
+
 void _starpu_scc_sink_execute(const struct _starpu_mp_node *node, void *arg, int arg_size);
 
+void (*_starpu_scc_sink_lookup (const struct _starpu_mp_node * node STARPU_ATTRIBUTE_UNUSED, char* func_name))(void);
+
 #endif /* STARPU_USE_SCC */
 
 

+ 28 - 121
src/drivers/scc/driver_scc_source.c

@@ -60,79 +60,40 @@ static void _starpu_scc_src_deinit_context(int devid)
 
 	_starpu_mp_common_node_destroy(scc_mp_nodes[devid]);
 }
-
-static int _starpu_scc_src_execute_job(struct _starpu_job *j, struct _starpu_worker *args)
+void (*_starpu_scc_src_get_kernel_from_job(const struct _starpu_mp_node *,struct _starpu_job *j))(void)
 {
-	int ret;
-	uint32_t mask = 0;
-
-	STARPU_ASSERT(j);
-	struct starpu_task *task = j->task;
-
-	struct timespec codelet_start, codelet_end;
-
-	int profiling = starpu_profiling_status_get();
-	unsigned calibrate_model = 0;
-
-	STARPU_ASSERT(task);
-	struct starpu_codelet *cl = task->cl;
-	STARPU_ASSERT(cl);
-
-	if (cl->model && cl->model->benchmarking)
-		calibrate_model = 1;
-
-	ret = _starpu_fetch_task_input(j, mask);
-	if (ret != 0)
+  starpu_scc_kernel_t kernel = NULL;
+
+  starpu_scc_func_t func = _starpu_task_get_scc_nth_implementation(j->task->cl, j->nimpl);
+  if (func)
+    {
+      /* We execute the function contained in the codelet, it must return a
+       * pointer to the function to execute on the device, either specified
+       * directly by the user or by a call to starpu_scc_get_kernel().
+       */
+      kernel = func();
+    }
+  else
+    {
+      /* If user doesn't define any starpu_scc_func_t in cl->scc_funcs we try to use
+       * cpu_funcs_name.
+       */
+      char *func_name = _starpu_task_get_cpu_name_nth_implementation(j->task->cl, j->nimpl);
+      if (func_name)
 	{
-		/* there was not enough memory, so the input of
-		 * the codelet cannot be fetched ... put the
-		 * codelet back, and try it later */
-		return -EAGAIN;
-	}
-
+	  starpu_scc_func_symbol_t symbol;
 
-	starpu_scc_kernel_t kernel = NULL;
+	  _starpu_scc_src_register_kernel(&symbol, func_name);
 
-	starpu_scc_func_t func = _starpu_task_get_scc_nth_implementation(j->task->cl, j->nimpl);
-	if (func)
-	{
-		/* We execute the function contained in the codelet, it must return a
-		 * pointer to the function to execute on the device, either specified
-		 * directly by the user or by a call to starpu_scc_get_kernel().
-		 */
-		kernel = func();
+	  kernel = _starpu_scc_src_get_kernel(symbol);
 	}
-	else
-	{
-		/* If user doesn't define any starpu_scc_func_t in cl->scc_funcs we try to use
-		 * cpu_funcs_name.
-		 */
-		char *func_name = _starpu_task_get_cpu_name_nth_implementation(j->task->cl, j->nimpl);
-		if (func_name)
-		{
-			starpu_scc_func_symbol_t symbol;
-
-			_starpu_scc_src_register_kernel(&symbol, func_name);
-
-			kernel = _starpu_scc_src_get_kernel(symbol);
-		}
-	}
-	STARPU_ASSERT(kernel);
-
-	_starpu_driver_start_job(args, j, &codelet_start, 0, profiling);
-
-	_starpu_src_common_execute_kernel_from_task(scc_mp_nodes[args->devid], (void (*)(void)) kernel, 0, task);
-
-	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0, profiling);
-
-	_starpu_driver_update_job_feedback(j, args, args->perf_arch, &codelet_start, &codelet_end, profiling);
-
-	_starpu_push_task_output(j, mask);
+    }
+  STARPU_ASSERT(kernel);  
 
-
-	return 0;
+  return (void (*)(void))kernel;
 }
 
+
 void _starpu_scc_src_mp_deinit()
 {
 	_starpu_scc_common_unmap_shared_memory();
@@ -320,7 +281,7 @@ int _starpu_scc_copy_sink_to_sink(void *src, unsigned src_node, void *dst, unsig
 
 void *_starpu_scc_src_worker(void *arg)
 {
-	struct _starpu_worker *args = arg;
+	struct _starpu_worker_set *args = arg;
 
 	int devid = args->devid;
 	int workerid = args->workerid;
@@ -350,64 +311,10 @@ void *_starpu_scc_src_worker(void *arg)
 	STARPU_PTHREAD_COND_SIGNAL(&args->ready_cond);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&args->mutex);
 
-	struct _starpu_job * j;
-	struct starpu_task *task;
-	int res;
-
-	while (_starpu_machine_is_running())
-	{
-		_STARPU_TRACE_START_PROGRESS(memnode);
-		_starpu_datawizard_progress(memnode, 1);
-		_STARPU_TRACE_END_PROGRESS(memnode);
-
-		task = _starpu_get_worker_task(args, workerid, memnode);
-		if (!task)
-			continue;
-
-		j = _starpu_get_job_associated_to_task(task);
-
-		/* can a SCC device do that task ? */
-		if (!_STARPU_SCC_MAY_PERFORM(j))
-		{
-			/* this isn't a SCC task */
-			_starpu_push_task_to_workers(task);
-			continue;
-		}
-
-		_starpu_set_current_task(task);
-		args->current_task = j->task;
-
-		res = _starpu_scc_src_execute_job(j, args);
-
-		_starpu_set_current_task(NULL);
-		args->current_task = NULL;
-
-		if (res)
-		{
-			switch (res)
-			{
-				case -EAGAIN:
-					_STARPU_DISP("ouch, SCC could not actually run task %p, putting it back...\n", task);
-					_starpu_push_task_to_workers(task);
-					STARPU_ABORT();
-					continue;
-				default:
-					STARPU_ASSERT(0);
-			}
-		}
-
-		_starpu_handle_job_termination(j);
-	}
+	_starpu_src_common_worker(args, baseworkerid, scc_mp_nodes[mp_nodeid]);
 
 	_STARPU_TRACE_WORKER_DEINIT_START;
 
-	_starpu_handle_all_pending_node_data_requests(memnode);
-
-	/* In case there remains some memory that was automatically
-	 * allocated by StarPU, we release it now. Note that data
-	 * coherency is not maintained anymore at that point ! */
-	_starpu_free_all_automatically_allocated_buffers(memnode);
-
 	_starpu_scc_src_deinit_context(args->devid);
 
 	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_SCC_KEY);

+ 1 - 0
src/drivers/scc/driver_scc_source.h

@@ -29,6 +29,7 @@
 
 void _starpu_scc_src_mp_deinit();
 
+void (*_starpu_scc_src_get_kernel_from_job(const struct _starpu_mp_node *,struct _starpu_job *j))(void);
 int _starpu_scc_src_register_kernel(starpu_scc_func_symbol_t *symbol, const char *func_name);
 starpu_scc_kernel_t _starpu_scc_src_get_kernel(starpu_scc_func_symbol_t symbol);
 

+ 16 - 18
src/sched_policies/parallel_eager.c

@@ -29,8 +29,8 @@ struct _starpu_peager_data
         starpu_pthread_mutex_t policy_mutex;
 };
 
-#define STARPU_NMAXCOMBINED_WORKERS 10
-/* XXX instead of 10, we should use some "MAX combination .."*/
+#define STARPU_NMAXCOMBINED_WORKERS 520 
+/* instead of STARPU_NMAXCOMBINED_WORKERS, we should use some "MAX combination .."*/
 static int possible_combinations_cnt[STARPU_NMAXWORKERS];
 static int possible_combinations[STARPU_NMAXWORKERS][STARPU_NMAXCOMBINED_WORKERS];
 static int possible_combinations_size[STARPU_NMAXWORKERS][STARPU_NMAXCOMBINED_WORKERS];
@@ -42,15 +42,11 @@ static int possible_combinations_size[STARPU_NMAXWORKERS][STARPU_NMAXCOMBINED_WO
 
 static void peager_add_workers(unsigned sched_ctx_id, int *workerids, unsigned nworkers)
 {
+	_starpu_sched_find_worker_combinations(workerids, nworkers);
 	struct _starpu_peager_data *data = (struct _starpu_peager_data*)starpu_sched_ctx_get_policy_data(sched_ctx_id);
 	unsigned nbasic_workers = starpu_worker_get_count();
-		
-	_starpu_sched_find_worker_combinations(workerids, nworkers);
-
+	unsigned ncombined_workers= starpu_combined_worker_get_count();
 	unsigned workerid, i;
-	unsigned ncombinedworkers;
-
-	ncombinedworkers = starpu_combined_worker_get_count();
 
 	/* Find the master of each worker. We first assign the worker as its
 	 * own master, and then iterate over the different worker combinations
@@ -67,7 +63,7 @@ static void peager_add_workers(unsigned sched_ctx_id, int *workerids, unsigned n
 	}
 
 
-	for (i = 0; i < ncombinedworkers; i++)
+	for (i = 0; i < ncombined_workers; i++)
 	{
 		workerid = nbasic_workers + i;
 
@@ -77,20 +73,17 @@ static void peager_add_workers(unsigned sched_ctx_id, int *workerids, unsigned n
 		starpu_combined_worker_get_description(workerid, &size, &workers);
 
 		int master = workers[0];
-
 		int j;
 		for (j = 0; j < size; j++)
 		{
 			if (data->master_id[workers[j]] > master)
 				data->master_id[workers[j]] = master;
-
 			int cnt = possible_combinations_cnt[workers[j]]++;
 			possible_combinations[workers[j]][cnt] = workerid;
 			possible_combinations_size[workers[j]][cnt] = size;
 		}
 	}
 
-
 	for(i = 0; i < nworkers; i++)
 	{
 		workerid = workerids[i];
@@ -176,9 +169,11 @@ static int push_task_peager_policy(struct starpu_task *task)
 	{
 		worker = workers->get_next(workers, &it);
 		int master = data->master_id[worker];
-		/* If this is not a CPU, then the worker simply grabs tasks from the fifo */
-		if ((!starpu_worker_is_combined_worker(worker) && starpu_worker_get_type(worker) != STARPU_CPU_WORKER)
-		    || (master == worker))
+		/* If this is not a CPU or a MIC, then the worker simply grabs tasks from the fifo */
+		if ((!starpu_worker_is_combined_worker(worker) && 
+		    starpu_worker_get_type(worker) != STARPU_MIC_WORKER &&
+		    starpu_worker_get_type(worker) != STARPU_CPU_WORKER)  
+			|| (master == worker))
 		{
 			starpu_pthread_mutex_t *sched_mutex;
 			starpu_pthread_cond_t *sched_cond;
@@ -198,8 +193,8 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 
 	int workerid = starpu_worker_get_id();
 
-	/* If this is not a CPU, then the worker simply grabs tasks from the fifo */
-	if (starpu_worker_get_type(workerid) != STARPU_CPU_WORKER)
+	/* If this is not a CPU or a MIC, then the worker simply grabs tasks from the fifo */
+	if (starpu_worker_get_type(workerid) != STARPU_CPU_WORKER && starpu_worker_get_type(workerid) != STARPU_MIC_WORKER)
 	{
 		struct starpu_task *task = NULL;
 		STARPU_PTHREAD_MUTEX_LOCK(&data->policy_mutex);
@@ -211,6 +206,9 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 
 	int master = data->master_id[workerid];
 
+	//_STARPU_DEBUG("workerid:%d, master:%d\n",workerid,master);
+
+
 	if (master == workerid)
 	{
 		/* The worker is a master */
@@ -248,9 +246,9 @@ static struct starpu_task *pop_task_peager_policy(unsigned sched_ctx_id)
 		/* Is this a basic worker or a combined worker ? */
 		int nbasic_workers = (int)starpu_worker_get_count();
 		int is_basic_worker = (best_workerid < nbasic_workers);
-
 		if (is_basic_worker)
 		{
+
 			/* The master is alone */
 			return task;
 		}

+ 7 - 2
src/util/starpu_data_cpy.c

@@ -78,8 +78,13 @@ void mp_cpy_kernel(void *descr[], void *cl_arg)
 	void *dst_interface = descr[0];
 	void *src_interface = descr[1];
 
-	STARPU_ASSERT(copy_methods->ram_to_ram);
-	copy_methods->ram_to_ram(src_interface, 0, dst_interface, 0);
+	if(copy_methods->ram_to_ram)
+		copy_methods->ram_to_ram(src_interface, STARPU_MAIN_RAM, dst_interface, STARPU_MAIN_RAM);
+	else if(copy_methods->any_to_any)
+		copy_methods->any_to_any(src_interface, STARPU_MAIN_RAM, dst_interface, STARPU_MAIN_RAM, NULL);
+	else
+		STARPU_ABORT();
+
 }
 
 static starpu_mic_kernel_t mic_cpy_func()

+ 2 - 0
tests/datawizard/acquire_cb_insert.c

@@ -77,6 +77,8 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+	if(starpu_cpu_worker_get_count() == 0) return STARPU_TEST_SKIPPED;
+
 	/* Declare x */
 	starpu_variable_data_register(&x_handle, STARPU_MAIN_RAM, (uintptr_t)&x, sizeof(x));
 

+ 6 - 2
tests/datawizard/commute.c

@@ -85,7 +85,9 @@ void end(void *descr[], void *_args STARPU_ATTRIBUTE_UNUSED)
 {
 	int *x = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
-	if (codelet_end.modes[0] & STARPU_W)
+	enum starpu_data_access_mode end_mode = *(enum starpu_data_access_mode*) _args;
+
+	if (end_mode & STARPU_W)
 		(*x)++;
 }
 
@@ -105,7 +107,7 @@ static void test(enum starpu_data_access_mode begin_mode, enum starpu_data_acces
 	int ret;
 
 	codelet_begin.modes[0] = begin_mode;
-	codelet_end.modes[0] = end_mode;
+	codelet_end.modes[0] = end_mode;	
 
 	begin_t = starpu_task_create();
 	begin_t->cl = &codelet_begin;
@@ -130,6 +132,8 @@ static void test(enum starpu_data_access_mode begin_mode, enum starpu_data_acces
 	end_t->cl = &codelet_end;
 	end_t->handles[0] = x_handle;
 	end_t->detach = 0;
+	end_t->cl_arg = &end_mode;
+	end_t->cl_arg_size = sizeof(end_mode);
 
 	if (starpu_task_submit(begin_t) == -ENODEV)
 		exit(STARPU_TEST_SKIPPED);

+ 2 - 0
tests/datawizard/data_invalidation.c

@@ -141,6 +141,8 @@ int main(int argc, char **argv)
 	if (ret == -ENODEV) return STARPU_TEST_SKIPPED;
 	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
 
+	if(starpu_cpu_worker_get_count() == 0) return STARPU_TEST_SKIPPED;
+
 	/* The buffer should never be explicitely allocated */
 	starpu_vector_data_register(&v_handle, (uint32_t)-1, (uintptr_t)NULL, VECTORSIZE, sizeof(char));
 

+ 1 - 1
tests/datawizard/interfaces/coo/coo_interface.c

@@ -21,7 +21,7 @@
 #define MATRIX_SIZE (NX*NY)
 
 #if defined(STARPU_USE_CPU) || defined(STAPRU_USE_MIC)
-static void test_coo_cpu_func(void *buffers[], void *args);
+void test_coo_cpu_func(void *buffers[], void *args);
 #endif
 #ifdef STARPU_USE_CUDA
 extern void test_coo_cuda_func(void *buffers[], void *args);

+ 0 - 1
tests/errorcheck/invalid_blocking_calls.c

@@ -47,7 +47,6 @@ static struct starpu_codelet wrong_codelet =
 	.cpu_funcs = {wrong_func, NULL},
 	.cuda_funcs = {wrong_func, NULL},
         .opencl_funcs = {wrong_func, NULL},
-	.cpu_funcs_name = {"wrong_func", NULL},
 	.model = NULL,
 	.nbuffers = 0
 };

+ 1 - 1
tests/loader-cross.sh.in

@@ -6,4 +6,4 @@ top_builddir="@top_builddir@"
 NATIVE=${PWD/\/build_mic\//\/build_host\/}
 DIR="$(dirname "$1")"
 FILE="$(basename "$1")"
-SINK_LD_LIBRARY_PATH="$top_builddir/src/.libs" STARPU_MIC_SINK_PROGRAM_NAME="$DIR/.libs/$FILE" $top_srcdir/build_host/tests/loader "$NATIVE/$1"
+SINK_LD_LIBRARY_PATH="$top_builddir/src/.libs" STARPU_MIC_SINK_PROGRAM_NAME="$DIR/.libs/$FILE" $top_builddir/../build_host/tests/loader "$NATIVE/$1"

+ 1 - 0
tests/main/starpu_init.c

@@ -83,6 +83,7 @@ int main(int argc, char **argv)
 
 	ret = check_cpu(-1, -1, -1, &cpu_init);
 	if (ret) return ret;
+	if (cpu_init == 0) return STARPU_TEST_SKIPPED;
 
 	if (cpu_init >= STARPU_MAXCPUS-5)
 	{

+ 2 - 1
tests/parallel_tasks/explicit_combined_worker.c

@@ -24,7 +24,7 @@
 #define N	1000
 #define VECTORSIZE	1024
 
-static void codelet_null(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+void codelet_null(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 	STARPU_SKIP_IF_VALGRIND;
 
@@ -43,6 +43,7 @@ static struct starpu_codelet cl =
 	.type = STARPU_FORKJOIN,
 	.max_parallelism = INT_MAX,
 	.cpu_funcs = {codelet_null, NULL},
+	.cpu_funcs_name = {"codelet_null", NULL},
 	.cuda_funcs = {codelet_null, NULL},
         .opencl_funcs = {codelet_null, NULL},
 	.nbuffers = 1,

+ 2 - 1
tests/parallel_tasks/parallel_kernels.c

@@ -24,7 +24,7 @@
 #define N	1000
 #define VECTORSIZE	1024
 
-static void codelet_null(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+void codelet_null(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 	STARPU_SKIP_IF_VALGRIND;
 
@@ -50,6 +50,7 @@ static struct starpu_codelet cl =
 	.max_parallelism = INT_MAX,
 	.cpu_funcs = {codelet_null, NULL},
 	.cuda_funcs = {codelet_null, NULL},
+	.cpu_funcs_name = {"codelet_null", NULL},
         .opencl_funcs = {codelet_null, NULL},
 	.model = &model,
 	.nbuffers = 1,

+ 2 - 1
tests/parallel_tasks/parallel_kernels_spmd.c

@@ -24,7 +24,7 @@
 #define N	1000
 #define VECTORSIZE	1024
 
-static void codelet_null(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+void codelet_null(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 	STARPU_SKIP_IF_VALGRIND;
 
@@ -51,6 +51,7 @@ static struct starpu_codelet cl =
 	.type = STARPU_SPMD,
 	.max_parallelism = INT_MAX,
 	.cpu_funcs = {codelet_null, NULL},
+	.cpu_funcs_name = {"codelet_null", NULL},
 	.cuda_funcs = {codelet_null, NULL},
         .opencl_funcs = {codelet_null, NULL},
 	.model = &model,

+ 2 - 1
tests/parallel_tasks/spmd_peager.c

@@ -26,7 +26,7 @@
 starpu_data_handle_t v_handle;
 static unsigned *v;
 
-static void codelet_null(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
+void codelet_null(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
 {
 	STARPU_SKIP_IF_VALGRIND;
 
@@ -49,6 +49,7 @@ static struct starpu_codelet cl =
 	.type = STARPU_SPMD,
 	.max_parallelism = INT_MAX,
 	.cpu_funcs = {codelet_null, NULL},
+	.cpu_funcs_name = {"codelet_null", NULL},
 	.cuda_funcs = {codelet_null, NULL},
         .opencl_funcs = {codelet_null, NULL},
 	.nbuffers = 1,

+ 2 - 1
tests/sched_policies/execute_all_tasks.c

@@ -26,7 +26,7 @@
 
 #define NTASKS           8
 
-static void
+void
 dummy(void *buffers[], void *args)
 {
 	(void) buffers;
@@ -50,6 +50,7 @@ run(struct starpu_sched_policy *p)
 	struct starpu_codelet cl =
 	{
 		.cpu_funcs    = {dummy, NULL},
+		.cpu_funcs_name = {"dummy", NULL},
 		.cuda_funcs   = {dummy, NULL},
 		.opencl_funcs = {dummy, NULL},
 		.nbuffers     = 0