Browse Source

Import MIC/SCC work from Nicolas, Damien, Brice and Ludovic, reworked a bit. Not even compiled on MIC/SCC yet

Samuel Thibault 12 years ago
parent
commit
57ac315b6d
88 changed files with 5907 additions and 92 deletions
  1. 3 1
      AUTHORS
  2. 3 0
      Makefile.am
  3. 283 1
      configure.ac
  4. 11 0
      doc/chapters/advanced-examples.texi
  5. 202 6
      doc/chapters/api.texi
  6. 3 0
      doc/chapters/basic-examples.texi
  7. 1 0
      doc/chapters/perf-optimization.texi
  8. 2 1
      doc/chapters/vector_scal_c.texi
  9. 1 1
      examples/Makefile.am
  10. 15 2
      examples/basic_examples/vector_scal.c
  11. 1 1
      examples/stencil/Makefile.am
  12. 22 0
      include/starpu.h
  13. 5 0
      include/starpu_config.h.in
  14. 8 1
      include/starpu_data.h
  15. 57 2
      include/starpu_data_interfaces.h
  16. 35 0
      include/starpu_mic.h
  17. 10 4
      include/starpu_perfmodel.h
  18. 35 0
      include/starpu_scc.h
  19. 23 0
      include/starpu_sink.h
  20. 13 0
      include/starpu_task.h
  21. 33 1
      include/starpu_worker.h
  22. 28 0
      libstarpu-mic.pc.in
  23. 1 1
      libstarpu.pc.in
  24. 47 3
      src/Makefile.am
  25. 2 0
      src/common/fxt.h
  26. 6 0
      src/core/jobs.h
  27. 51 4
      src/core/perfmodel/perfmodel_bus.c
  28. 44 16
      src/core/perfmodel/perfmodel_history.c
  29. 28 0
      src/core/sched_policy.c
  30. 26 0
      src/core/task.c
  31. 4 0
      src/core/task.h
  32. 389 13
      src/core/topology.c
  33. 1 1
      src/core/topology.h
  34. 219 2
      src/core/workers.c
  35. 14 0
      src/core/workers.h
  36. 5 0
      src/datawizard/coherency.c
  37. 138 0
      src/datawizard/copy_driver.c
  38. 15 0
      src/datawizard/copy_driver.h
  39. 1 0
      src/datawizard/interfaces/bcsr_filters.c
  40. 4 2
      src/datawizard/interfaces/bcsr_interface.c
  41. 1 0
      src/datawizard/interfaces/block_filters.c
  42. 186 2
      src/datawizard/interfaces/block_interface.c
  43. 4 2
      src/datawizard/interfaces/coo_interface.c
  44. 1 0
      src/datawizard/interfaces/csr_filters.c
  45. 6 2
      src/datawizard/interfaces/csr_interface.c
  46. 39 0
      src/datawizard/interfaces/data_interface.c
  47. 10 0
      src/datawizard/interfaces/data_interface.h
  48. 2 0
      src/datawizard/interfaces/matrix_filters.c
  49. 161 0
      src/datawizard/interfaces/matrix_interface.c
  50. 105 2
      src/datawizard/interfaces/multiformat_interface.c
  51. 20 4
      src/datawizard/interfaces/variable_interface.c
  52. 4 0
      src/datawizard/interfaces/vector_filters.c
  53. 10 2
      src/datawizard/interfaces/vector_interface.c
  54. 2 2
      src/datawizard/interfaces/void_interface.c
  55. 35 0
      src/datawizard/malloc.c
  56. 24 1
      src/datawizard/reduction.c
  57. 19 0
      src/debug/traces/starpu_fxt.c
  58. 2 1
      src/drivers/gordon/driver_gordon.c
  59. 116 0
      src/drivers/mic/driver_mic_common.c
  60. 69 0
      src/drivers/mic/driver_mic_common.h
  61. 135 0
      src/drivers/mic/driver_mic_sink.c
  62. 48 0
      src/drivers/mic/driver_mic_sink.h
  63. 749 0
      src/drivers/mic/driver_mic_source.c
  64. 79 0
      src/drivers/mic/driver_mic_source.h
  65. 45 0
      src/drivers/mic/driver_mic_utils.c
  66. 234 0
      src/drivers/mp_common/mp_common.c
  67. 178 0
      src/drivers/mp_common/mp_common.h
  68. 275 0
      src/drivers/mp_common/sink_common.c
  69. 39 0
      src/drivers/mp_common/sink_common.h
  70. 368 0
      src/drivers/mp_common/source_common.c
  71. 63 0
      src/drivers/mp_common/source_common.h
  72. 174 0
      src/drivers/scc/driver_scc_common.c
  73. 50 0
      src/drivers/scc/driver_scc_common.h
  74. 125 0
      src/drivers/scc/driver_scc_sink.c
  75. 38 0
      src/drivers/scc/driver_scc_sink.h
  76. 408 0
      src/drivers/scc/driver_scc_source.c
  77. 56 0
      src/drivers/scc/driver_scc_source.h
  78. 45 0
      src/drivers/scc/driver_scc_utils.c
  79. 6 0
      src/top/starpu_top.c
  80. 3 0
      src/util/execute_on_all.c
  81. 61 8
      src/util/starpu_data_cpy.c
  82. 1 1
      src/util/starpu_insert_task.c
  83. 35 0
      starpu-1.0-mic.pc.in
  84. 1 1
      starpu-1.0.pc.in
  85. 1 1
      starpu-1.1.pc.in
  86. 74 0
      super-configure
  87. 1 0
      tools/Makefile.am
  88. 10 0
      tools/starpu_machine_display.c

+ 3 - 1
AUTHORS

@@ -10,6 +10,8 @@ David Gómez <david_gomez1380@yahoo.com.mx>
 Sylvain Henry <sylvain.henry@inria.fr>
 Sylvain Henry <sylvain.henry@inria.fr>
 Mehdi Juhoor <mjuhoor@gmail.com>
 Mehdi Juhoor <mjuhoor@gmail.com>
 Antoine Lucas <antoine.lucas.33@gmail.com>
 Antoine Lucas <antoine.lucas.33@gmail.com>
+Brice Mortier <brice.mortier@etu.u-bordeaux1.fr>
+Damien Pasqualinotto <dam.pasqualinotto@wanadoo.fr>
 Nguyen Quôc-Dinh <nguyen.quocdinh@gmail.com>
 Nguyen Quôc-Dinh <nguyen.quocdinh@gmail.com>
 Cyril Roelandt <cyril.roelandt@inria.fr>
 Cyril Roelandt <cyril.roelandt@inria.fr>
 Anthony Roy <theanthony33@gmail.com>
 Anthony Roy <theanthony33@gmail.com>
@@ -17,4 +19,4 @@ Ludovic Stordeur <ludovic.stordeur@inria.fr>
 François Tessier <francois.tessier@inria.fr>
 François Tessier <francois.tessier@inria.fr>
 Samuel Thibault <samuel.thibault@labri.fr>
 Samuel Thibault <samuel.thibault@labri.fr>
 Pierre-André Wacrenier <wacrenier@labri.fr>
 Pierre-André Wacrenier <wacrenier@labri.fr>
-Andra Hugo <andra.hugo@inria.fr>
+Andra Hugo <andra.hugo@inria.fr>

+ 3 - 0
Makefile.am

@@ -67,6 +67,9 @@ versinclude_HEADERS = 				\
 	include/starpu_fxt.h			\
 	include/starpu_fxt.h			\
 	include/starpu_cuda.h			\
 	include/starpu_cuda.h			\
 	include/starpu_opencl.h			\
 	include/starpu_opencl.h			\
+	include/starpu_sink.h			\
+	include/starpu_mic.h			\
+	include/starpu_scc.h			\
 	include/starpu_expert.h			\
 	include/starpu_expert.h			\
 	include/starpu_profiling.h		\
 	include/starpu_profiling.h		\
 	include/starpu_bound.h			\
 	include/starpu_bound.h			\

+ 283 - 1
configure.ac

@@ -100,6 +100,31 @@ else
    LC_ALL=C svnversion $srcdir > ./STARPU-REVISION
    LC_ALL=C svnversion $srcdir > ./STARPU-REVISION
 fi
 fi
 
 
+AM_CONDITIONAL([STARPU_CROSS_COMPILING], [test "x$cross_compiling" = "xyes"])
+
+###############################################################################
+#                                                                             #
+#                           MIC device compilation                            #
+#   (Must be done in beginning to change prefix in the whole configuration)   #
+#                                                                             #
+###############################################################################
+
+# MIC support is opt-in.  The action-if-given branch must propagate
+# $enableval, otherwise --disable-mic (enableval=no) would turn MIC *on*.
+AC_ARG_ENABLE(mic, [AS_HELP_STRING([--enable-mic],
+	      [use MIC device(s)])], [enable_mic=$enableval], [enable_mic=no])
+# RMA transfers are enabled by default; --disable-mic-rma leaves
+# enable_mic_rma set to "no" through the implicit enable_* variable.
+AC_ARG_ENABLE(mic-rma, [AS_HELP_STRING([--disable-mic-rma],
+	      [do not use MIC RMA transfer])], [], [enable_mic_rma=yes])
+
+if test x$enable_mic = xyes ; then
+	AC_DEFINE(STARPU_USE_MIC, [1], [MIC workers support is enabled])
+fi
+if test x$enable_mic_rma = xyes ; then
+	AC_DEFINE([STARPU_MIC_USE_RMA], [1], [MIC RMA transfer is enabled])
+fi
+
+AM_CONDITIONAL([STARPU_USE_MIC], [test "x$enable_mic" = "xyes"])
+
+###############################################################################
+
 AC_PATH_PROGS([STARPU_MS_LIB], [lib])
 AC_PATH_PROGS([STARPU_MS_LIB], [lib])
 AC_ARG_VAR([STARPU_MS_LIB], [Path to Microsoft's Visual Studio `lib' tool])
 AC_ARG_VAR([STARPU_MS_LIB], [Path to Microsoft's Visual Studio `lib' tool])
 AM_CONDITIONAL([STARPU_HAVE_MS_LIB], [test "x$STARPU_MS_LIB" != "x"])
 AM_CONDITIONAL([STARPU_HAVE_MS_LIB], [test "x$STARPU_MS_LIB" != "x"])
@@ -880,6 +905,252 @@ if test x$enable_blocking = xno -a x$enable_simgrid != xyes ; then
 	AC_DEFINE(STARPU_NON_BLOCKING_DRIVERS, [1], [drivers must progress])
 	AC_DEFINE(STARPU_NON_BLOCKING_DRIVERS, [1], [drivers must progress])
 fi
 fi
 
 
+###############################################################################
+#                                                                             #
+#                                 MIC settings                                #
+#                                                                             #
+###############################################################################
+
+AC_MSG_CHECKING(maximum number of MIC devices)
+AC_ARG_ENABLE(maxmicdev, [AS_HELP_STRING([--enable-maxmicdev=<number>],
+			[maximum number of MIC devices])],
+			nmaxmicdev=$enableval, nmaxmicdev=4)
+AC_MSG_RESULT($nmaxmicdev)
+
+AC_DEFINE_UNQUOTED(STARPU_MAXMICDEVS, [$nmaxmicdev],
+	[maximum number of MIC devices])
+
+AC_MSG_CHECKING(maximum number of MIC cores)
+# The feature name must be "maxmiccore" so that --enable-maxmiccore=<n> is
+# the option that sets $enableval here (it was registered as maxmicdev).
+AC_ARG_ENABLE(maxmiccore, [AS_HELP_STRING([--enable-maxmiccore=<number>],
+			[maximum number of MIC cores])],
+			nmaxmiccore=$enableval, nmaxmiccore=128)
+AC_MSG_RESULT($nmaxmiccore)
+
+AC_DEFINE_UNQUOTED(STARPU_MAXMICCORES, [$nmaxmiccore],
+	[maximum number of MIC cores])
+
+AC_ARG_WITH(coi-dir,
+	[AS_HELP_STRING([--with-coi-dir=<path>],
+	[specify the MIC's COI installation directory])],
+	[coi_dir="$withval"],
+	[coi_dir=no])
+
+AC_ARG_WITH(coi-include-dir,
+	[AS_HELP_STRING([--with-coi-include-dir=<path>],
+	[specify where the MIC's COI headers are installed])],
+	[coi_include_dir="$withval"],
+	[coi_include_dir=no])
+
+AC_ARG_WITH(coi-lib-dir,
+	[AS_HELP_STRING([--with-coi-lib-dir=<path>],
+	[specify where the MIC's COI libraries are installed])],
+	[coi_lib_dir="$withval"],
+	[coi_lib_dir=no])
+
+# STARPU_CHECK_COI_RUNTIME(coi_dir, coi_include_dir, coi_lib_dir, coi_lib_name)
+#
+# Look for the MIC COI runtime: the source/COIEngine_source.h header and the
+# library named by the 4th argument.  Each directory argument may be "no",
+# meaning "not specified"; include/lib directories then default to
+# <coi_dir>/include and <coi_dir>/lib (falling back to <coi_dir>/lib64).
+#
+# Sets have_valid_coi to yes/no.  On success, sets STARPU_COI_CPPFLAGS (when
+# an include directory is known) and STARPU_COI_LDFLAGS.  CPPFLAGS and
+# LDFLAGS are restored before returning.
+AC_DEFUN([STARPU_CHECK_COI_RUNTIME],
+[
+    __coi_dir=$1
+    __coi_include_dir=$2
+    __coi_lib_dir=$3
+    __coi_lib_name=$4
+
+    if test "$__coi_dir" != "no" -a "$__coi_dir" != "" ; then
+	AC_MSG_CHECKING(whether MIC's COI runtime is available in $__coi_dir)
+    else
+	AC_MSG_CHECKING(whether MIC's COI runtime is available)
+    fi
+    AC_MSG_RESULT()
+
+    if test "$__coi_include_dir" = "no" -a "$__coi_dir" != "no" ; then
+        __coi_include_dir="${__coi_dir}/include"
+    fi
+    if test "$__coi_lib_dir" = "no" -a "$__coi_dir" != "no" ; then
+        __coi_lib_dir="${__coi_dir}/lib"
+    fi
+
+    SAVED_CPPFLAGS="$CPPFLAGS"
+    SAVED_LDFLAGS="$LDFLAGS"
+
+    if test "$__coi_include_dir" != "no" ; then
+        CPPFLAGS="${CPPFLAGS} -I$__coi_include_dir"
+    fi
+    if test "$__coi_lib_dir" != "no" ; then
+	LDFLAGS="${LDFLAGS} -L$__coi_lib_dir"
+    fi
+
+    AC_CHECK_HEADER([source/COIEngine_source.h],[have_valid_coi=yes],[have_valid_coi=no])
+
+    if test "$have_valid_coi" = "yes" ; then
+	AC_HAVE_LIBRARY([$__coi_lib_name],[have_valid_coi=yes],[have_valid_coi=no])
+
+        if test "$have_valid_coi" = "no" ; then
+            if test "$3" = "no" -a "$__coi_dir" != "no" ; then
+		# ${__coi_dir}/lib didn't work, let's try with lib64
+                __coi_lib_dir="$__coi_dir/lib64"
+		LDFLAGS="${SAVED_LDFLAGS} -L$__coi_lib_dir"
+	        AC_HAVE_LIBRARY([$__coi_lib_name],[have_valid_coi=yes],[have_valid_coi=no])
+            fi
+        fi
+    fi
+
+    if test "$have_valid_coi" = "yes" -a "$__coi_include_dir" != "no"; then
+        STARPU_COI_CPPFLAGS="-I$__coi_include_dir"
+    fi
+
+    if test "$have_valid_coi" = "yes" ; then
+        # Always export the library itself: the checks above are given an
+        # explicit action, so they do not add it to LIBS.  Without this, a
+        # runtime found in the default linker path would never be linked.
+        STARPU_COI_LDFLAGS="-l$__coi_lib_name"
+        if test "$__coi_lib_dir" != "no" ; then
+            STARPU_COI_LDFLAGS="-L$__coi_lib_dir $STARPU_COI_LDFLAGS"
+        fi
+    fi
+
+    CPPFLAGS="${SAVED_CPPFLAGS}"
+    LDFLAGS="${SAVED_LDFLAGS}"
+])
+
+if test x$enable_mic = xyes ; then
+
+    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, "coi_host")
+
+    # Host runtime is not compatible, we are probably cross-compiling
+    # Let's have a look for the device runtime which lib has a different name
+    if test "$have_valid_coi" = "no" ; then
+	    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, "coi_device")
+    fi
+
+    if test "$have_valid_coi" = "no" ; then
+	AC_MSG_ERROR([cannot find MIC's COI runtime])
+    fi
+
+    AC_SUBST(STARPU_COI_CPPFLAGS)
+    AC_SUBST(STARPU_COI_LDFLAGS)
+fi
+
+###############################################################################
+#                                                                             #
+#                                 SCC settings                                #
+#                                                                             #
+###############################################################################
+
+AC_ARG_ENABLE([rcce], [AS_HELP_STRING([--disable-rcce],
+			  [do not use SCC device(s)])], [], enable_rcce=maybe)
+
+nmaxsccdev=47
+AC_DEFINE_UNQUOTED(STARPU_MAXSCCDEVS, [$nmaxsccdev], [maximum number of SCC devices])
+
+AC_ARG_WITH(rcce-dir,
+			[AS_HELP_STRING([--with-rcce-dir=<path>],
+							[specify RCCE installation directory])],
+			[
+				rcce_dir="$withval"
+				enable_rcce=yes
+			],
+			rcce_dir=no)
+
+AC_ARG_WITH(rcce-include-dir,
+			[AS_HELP_STRING([--with-rcce-include-dir=<path>],
+							[specify where RCCE headers are installed])],
+			[
+				rcce_include_dir="$withval"
+				enable_rcce=yes
+			],
+			rcce_include_dir=no)
+
+AC_ARG_WITH(rcce-lib-dir,
+			[AS_HELP_STRING([--with-rcce-lib-dir=<path>],
+							[specify where RCCE libraries are installed])],
+			[
+			 	rcce_lib_dir="$withval"
+			 	enable_rcce=yes
+			],
+			rcce_lib_dir=no)
+
+if test x$enable_rcce = xyes -o x$enable_rcce = xmaybe ; then
+	have_valid_rcce=yes
+
+	SAVED_LDFLAGS="${LDFLAGS}"
+	SAVED_CPPFLAGS="${CPPFLAGS}"
+	SAVED_LIBS="${LIBS}"
+
+	if test x$rcce_include_dir != xno ; then
+		STARPU_RCCE_CPPFLAGS="-I${rcce_include_dir}"
+	elif test x$rcce_dir != xno ; then
+		STARPU_RCCE_CPPFLAGS="-I${rcce_dir}/include"
+	fi
+
+	CPPFLAGS="${CPPFLAGS} ${STARPU_RCCE_CPPFLAGS}"
+	AC_CHECK_HEADER([RCCE.h], [], [have_valid_rcce=no])
+
+	# Library search path: an explicit --with-rcce-lib-dir wins, otherwise
+	# fall back to <rcce_dir>/lib (rcce_dir is set by --with-rcce-dir).
+	if test x$rcce_lib_dir != xno ; then
+		STARPU_RCCE_LDFLAGS="-L${rcce_lib_dir}"
+	elif test x$rcce_dir != xno ; then
+		STARPU_RCCE_LDFLAGS="-L${rcce_dir}/lib"
+	fi
+
+	LDFLAGS="${LDFLAGS} ${STARPU_RCCE_LDFLAGS}"
+	AC_CHECK_LIB([RCCE_bigflags_nongory_nopwrmgmt], [RCCE_init], [], [have_valid_rcce=no])
+
+	# in case RCCE was explicitly required, but is not available, this is an error
+	if test x$enable_rcce = xyes -a x$have_valid_rcce = xno ; then
+		AC_MSG_ERROR([cannot find RCCE library])
+	fi
+
+	if test x$have_valid_rcce = xyes ; then
+		STARPU_RCCE_CPPFLAGS="${STARPU_RCCE_CPPFLAGS} -DSCC"
+		STARPU_RCCE_LDFLAGS="${STARPU_RCCE_LDFLAGS} -lRCCE_bigflags_nongory_nopwrmgmt -ldl"
+
+		AC_DEFINE(STARPU_USE_SCC, [1], [SCC support is enabled])
+
+		AC_SUBST(STARPU_RCCE_CFLAGS)
+		AC_SUBST(STARPU_RCCE_CPPFLAGS)
+		AC_SUBST(STARPU_RCCE_LDFLAGS)
+	fi
+
+	LDFLAGS="${SAVED_LDFLAGS}"
+	CPPFLAGS="${SAVED_CPPFLAGS}"
+	LIBS="${SAVED_LIBS}"
+
+	enable_rcce=$have_valid_rcce
+fi
+
+AM_CONDITIONAL(STARPU_USE_SCC, test x$enable_rcce = xyes)
+
+AC_MSG_CHECKING(whether RCCE should be used)
+AC_MSG_RESULT($enable_rcce)
+
+
+###############################################################################
+#                                                                             #
+#                             MP Common settings                              #
+#                                                                             #
+###############################################################################
+
+AM_CONDITIONAL([STARPU_USE_MP], [test "x$enable_mic" = "xyes" -o "x$enable_mpi" = "xyes" -o "x$enable_rcce" = "xyes"])
+
+AC_ARG_ENABLE([export-dynamic], [AS_HELP_STRING([--disable-export-dynamic],
+			  [Prevent the linker from adding all symbols to the dynamic symbol table])], [], [])
+
+if test x$enable_mic = xyes -o x$enable_mpi = xyes -o x$enable_rcce = xyes ; then
+	AC_DEFINE(STARPU_USE_MP, [1], [Message-passing SINKs support
+		  is enabled])
+
+	if test x$enable_export_dynamic != xno ; then
+		STARPU_EXPORT_DYNAMIC="-rdynamic"
+	fi
+fi
+
+AC_SUBST(STARPU_EXPORT_DYNAMIC)
+
+# Computes the maximum number of different kernels a message-passing sink
+# can lookup for and launch.
+AC_MSG_CHECKING(Maximum number of message-passing kernels)
+# The option text handed to AS_HELP_STRING must be the real flag spelling
+# (two dashes, no embedded newline) so that ./configure --help is accurate.
+AC_ARG_ENABLE(maxmpkernels, [AS_HELP_STRING([--enable-maxmpkernels=<number>],
+	      [maximum number of kernels a message-passing sink can lookup
+	      for and execute])],
+	      maxmpkernels=$enableval, maxmpkernels=10)
+AC_MSG_RESULT($maxmpkernels)
+AC_DEFINE_UNQUOTED(STARPU_MAXMPKERNELS, [$maxmpkernels],
+		[maximum number of message-passing kernels])
+
+###############################################################################
 
 
 ###############################################################################
 ###############################################################################
 #                                                                             #
 #                                                                             #
@@ -1074,6 +1345,7 @@ if test x$enable_simgrid = xyes ; then
 	maxnodes=16
 	maxnodes=16
 else
 else
 	# We have one memory node shared by all CPU workers, one node per GPU
 	# We have one memory node shared by all CPU workers, one node per GPU
+	# and per MIC device
 	nodes=1
 	nodes=1
 	if test x$enable_cuda = xyes ; then
 	if test x$enable_cuda = xyes ; then
 		# we could have used nmaxcudadev + 1, but this would certainly give an
 		# we could have used nmaxcudadev + 1, but this would certainly give an
@@ -1085,6 +1357,14 @@ else
 		# odd number.
 		# odd number.
 		nodes=`expr $nodes + $nmaxopencldev`
 		nodes=`expr $nodes + $nmaxopencldev`
 	fi
 	fi
+	# Accumulate into $nodes, like the CUDA/OpenCL cases above: $maxnodes is
+	# recomputed from $nodes (and reset to 1) right below, so adding to
+	# $maxnodes here would be lost and would feed expr an unset variable.
+	if test x$enable_mic = xyes ; then
+		# One memory node per MIC device.
+		nodes=`expr $nodes + $nmaxmicdev`
+	fi
+	if test x$enable_rcce = xyes ; then
+		# Only 1 memory node for the shared memory.
+		nodes=`expr $nodes + 1`
+	fi
+
+
 	# set maxnodes to the next power of 2 greater than nodes
 	# set maxnodes to the next power of 2 greater than nodes
 	maxnodes=1
 	maxnodes=1
 	while test "$maxnodes" -lt "$nodes"
 	while test "$maxnodes" -lt "$nodes"
@@ -1136,7 +1416,7 @@ AC_CHECK_FUNCS([clock_gettime])
 
 
 # Compute the maximum number of workers (we round it to 16 for alignment
 # Compute the maximum number of workers (we round it to 16 for alignment
 # purposes).
 # purposes).
-nmaxworkers=`expr 16 \* \( \( $maxcpus + $nmaxcudadev + $nmaxopencldev + 15 \) / 16 \) `
+nmaxworkers=`expr 16 \* \( \( $maxcpus + $nmaxcudadev + $nmaxopencldev + $nmaxmiccore + $nmaxsccdev + 15 \) / 16 \) `
 AC_MSG_CHECKING(Maximum number of workers)
 AC_MSG_CHECKING(Maximum number of workers)
 AC_MSG_RESULT($nmaxworkers)
 AC_MSG_RESULT($nmaxworkers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])
 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])
@@ -1936,6 +2216,8 @@ AC_MSG_NOTICE([
 	CPUs   enabled: $enable_cpu
 	CPUs   enabled: $enable_cpu
 	CUDA   enabled: $enable_cuda
 	CUDA   enabled: $enable_cuda
 	OpenCL enabled: $enable_opencl
 	OpenCL enabled: $enable_opencl
+	SCC    enabled: $enable_rcce
+	MIC    enabled: $enable_mic
 
 
 	Compile-time limits
 	Compile-time limits
 	(change these with --enable-maxcpus, --enable-maxcudadev,
 	(change these with --enable-maxcpus, --enable-maxcudadev,

+ 11 - 0
doc/chapters/advanced-examples.texi

@@ -61,6 +61,7 @@ void scal_sse_func(void *buffers[], void *cl_arg)
 struct starpu_codelet cl = @{
 struct starpu_codelet cl = @{
     .where = STARPU_CPU,
     .where = STARPU_CPU,
     .cpu_funcs = @{ scal_cpu_func, scal_sse_func, NULL @},
     .cpu_funcs = @{ scal_cpu_func, scal_sse_func, NULL @},
+    .cpu_funcs_name = @{ "scal_cpu_func", "scal_sse_func", NULL @},
     .nbuffers = 1,
     .nbuffers = 1,
     .modes = @{ STARPU_RW @}
     .modes = @{ STARPU_RW @}
 @};
 @};
@@ -100,6 +101,7 @@ struct starpu_codelet cl = @{
     .where = STARPU_CPU|STARPU_CUDA,
     .where = STARPU_CPU|STARPU_CUDA,
     .can_execute = can_execute,
     .can_execute = can_execute,
     .cpu_funcs = @{ cpu_func, NULL @},
     .cpu_funcs = @{ cpu_func, NULL @},
+    .cpu_funcs_name = @{ "cpu_func", NULL @},
     .cuda_funcs = @{ gpu_func, NULL @}
     .cuda_funcs = @{ gpu_func, NULL @}
     .nbuffers = 1,
     .nbuffers = 1,
     .modes = @{ STARPU_RW @}
     .modes = @{ STARPU_RW @}
@@ -146,6 +148,7 @@ struct starpu_codelet cl = @{
     .where = STARPU_CPU|STARPU_CUDA,
     .where = STARPU_CPU|STARPU_CUDA,
     .can_execute = can_execute,
     .can_execute = can_execute,
     .cpu_funcs = @{ cpu_func, NULL @},
     .cpu_funcs = @{ cpu_func, NULL @},
+    .cpu_funcs_name = @{ "cpu_func", NULL @},
     .cuda_funcs = @{ scal_gpu_13, scal_gpu_20, NULL @},
     .cuda_funcs = @{ scal_gpu_13, scal_gpu_20, NULL @},
     .nbuffers = 1,
     .nbuffers = 1,
     .modes = @{ STARPU_RW @}
     .modes = @{ STARPU_RW @}
@@ -359,6 +362,7 @@ static struct starpu_perfmodel mult_perf_model = @{
 struct starpu_codelet cl = @{
 struct starpu_codelet cl = @{
     .where = STARPU_CPU,
     .where = STARPU_CPU,
     .cpu_funcs = @{ cpu_mult, NULL @},
     .cpu_funcs = @{ cpu_mult, NULL @},
+    .cpu_funcs_name = @{ "cpu_mult", NULL @},
     .nbuffers = 3,
     .nbuffers = 3,
     .modes = @{ STARPU_R, STARPU_R, STARPU_W @},
     .modes = @{ STARPU_R, STARPU_R, STARPU_W @},
     /* for the scheduling policy to be able to use performance models */
     /* for the scheduling policy to be able to use performance models */
@@ -520,6 +524,7 @@ void func_cpu(void *descr[], void *_args)
 struct starpu_codelet mycodelet = @{
 struct starpu_codelet mycodelet = @{
         .where = STARPU_CPU,
         .where = STARPU_CPU,
         .cpu_funcs = @{ func_cpu, NULL @},
         .cpu_funcs = @{ func_cpu, NULL @},
+        .cpu_funcs_name = @{ "func_cpu", NULL @},
         .nbuffers = 2,
         .nbuffers = 2,
         .modes = @{ STARPU_RW, STARPU_RW @}
         .modes = @{ STARPU_RW, STARPU_RW @}
 @};
 @};
@@ -623,6 +628,7 @@ the codelets for initialization and reduction:
 struct starpu_codelet bzero_variable_cl =
 struct starpu_codelet bzero_variable_cl =
 @{
 @{
         .cpu_funcs = @{ bzero_variable_cpu, NULL @},
         .cpu_funcs = @{ bzero_variable_cpu, NULL @},
+        .cpu_funcs_name = @{ "bzero_variable_cpu", NULL @},
         .cuda_funcs = @{ bzero_variable_cuda, NULL @},
         .cuda_funcs = @{ bzero_variable_cuda, NULL @},
         .nbuffers = 1,
         .nbuffers = 1,
 @}
 @}
@@ -645,6 +651,7 @@ static void accumulate_variable_cuda(void *descr[], void *cl_arg)
 struct starpu_codelet accumulate_variable_cl =
 struct starpu_codelet accumulate_variable_cl =
 @{
 @{
         .cpu_funcs = @{ accumulate_variable_cpu, NULL @},
         .cpu_funcs = @{ accumulate_variable_cpu, NULL @},
+        .cpu_funcs_name = @{ "accumulate_variable_cpu", NULL @},
         .cuda_funcs = @{ accumulate_variable_cuda, NULL @},
         .cuda_funcs = @{ accumulate_variable_cuda, NULL @},
         .nbuffers = 1,
         .nbuffers = 1,
 @}
 @}
@@ -827,6 +834,7 @@ static struct starpu_codelet cl =
     .type = STARPU_FORKJOIN,
     .type = STARPU_FORKJOIN,
     .max_parallelism = INT_MAX,
     .max_parallelism = INT_MAX,
     .cpu_funcs = @{scal_cpu_func, NULL@},
     .cpu_funcs = @{scal_cpu_func, NULL@},
+    .cpu_funcs_name = @{"scal_cpu_func", NULL@},
     .nbuffers = 1,
     .nbuffers = 1,
 @};
 @};
 @end smallexample
 @end smallexample
@@ -870,6 +878,7 @@ static struct starpu_codelet cl =
     .type = STARPU_SPMD,
     .type = STARPU_SPMD,
     .max_parallelism = INT_MAX,
     .max_parallelism = INT_MAX,
     .cpu_funcs = @{ func, NULL @},
     .cpu_funcs = @{ func, NULL @},
+    .cpu_funcs_name = @{ "func", NULL @},
     .nbuffers = 1,
     .nbuffers = 1,
 @}
 @}
 @end smallexample
 @end smallexample
@@ -977,6 +986,7 @@ void opencl_to_cpu_func(void *buffers[], void *args);
 struct starpu_codelet opencl_to_cpu_cl = @{
 struct starpu_codelet opencl_to_cpu_cl = @{
     .where = STARPU_CPU,
     .where = STARPU_CPU,
     .cpu_funcs = @{ opencl_to_cpu_func, NULL @},
     .cpu_funcs = @{ opencl_to_cpu_func, NULL @},
+    .cpu_funcs_name = @{ "opencl_to_cpu_func", NULL @},
     .nbuffers = 1,
     .nbuffers = 1,
     .modes = @{ STARPU_RW @}
     .modes = @{ STARPU_RW @}
 @};
 @};
@@ -1287,6 +1297,7 @@ struct starpu_codelet dummy_big_cl =
 	.cuda_funcs = @{dummy_big_kernel, NULL@},
 	.cuda_funcs = @{dummy_big_kernel, NULL@},
 	.opencl_funcs = @{dummy_big_kernel, NULL@},
 	.opencl_funcs = @{dummy_big_kernel, NULL@},
 	.cpu_funcs = @{dummy_big_kernel, NULL@},
 	.cpu_funcs = @{dummy_big_kernel, NULL@},
+	.cpu_funcs_name = @{"dummy_big_kernel", NULL@},
 	.nbuffers = STARPU_NMAXBUFS+1,
 	.nbuffers = STARPU_NMAXBUFS+1,
 	.dyn_modes = modes
 	.dyn_modes = modes
 @};
 @};

+ 202 - 6
doc/chapters/api.texi

@@ -24,6 +24,8 @@
 * Theoretical lower bound on execution time API::
 * Theoretical lower bound on execution time API::
 * CUDA extensions::
 * CUDA extensions::
 * OpenCL extensions::
 * OpenCL extensions::
+* MIC extensions::
+* SCC extensions::
 * Miscellaneous helpers::
 * Miscellaneous helpers::
 * FXT Support::
 * FXT Support::
 * FFT Support::
 * FFT Support::
@@ -104,6 +106,14 @@ be specified with the @code{STARPU_NCUDA} environment variable.
 This is the number of OpenCL devices that StarPU can use. This can
 This is the number of OpenCL devices that StarPU can use. This can
 also be specified with the @code{STARPU_NOPENCL} environment variable.
 also be specified with the @code{STARPU_NOPENCL} environment variable.
 
 
+@item @code{int nmic} (default = -1)
+This is the number of MIC devices that StarPU can use. This can
+also be specified with the @code{STARPU_NMIC} environment variable.
+
+@item @code{int nscc} (default = -1)
+This is the number of SCC devices that StarPU can use. This can
+also be specified with the @code{STARPU_NSCC} environment variable.
+
 @item @code{unsigned use_explicit_workers_bindid} (default = 0)
 @item @code{unsigned use_explicit_workers_bindid} (default = 0)
 If this flag is set, the @code{workers_bindid} array indicates where the
 If this flag is set, the @code{workers_bindid} array indicates where the
 different workers are bound, otherwise StarPU automatically selects where to
 different workers are bound, otherwise StarPU automatically selects where to
@@ -139,6 +149,26 @@ the @code{STARPU_WORKERS_OPENCLID} environment variable.
 If the @code{use_explicit_workers_opencl_gpuid} flag is set, this array
 If the @code{use_explicit_workers_opencl_gpuid} flag is set, this array
 contains the logical identifiers of the OpenCL devices to be used.
 contains the logical identifiers of the OpenCL devices to be used.
 
 
+@item @code{unsigned use_explicit_workers_mic_gpuid} (default = 0)
+If this flag is set, the MIC workers will be attached to the MIC devices
+specified in the @code{workers_mic_gpuid} array. Otherwise, StarPU assigns
+the MIC devices in a round-robin fashion. This can also be specified with
+the @code{STARPU_WORKERS_MICID} environment variable.
+
+@item @code{unsigned workers_mic_gpuid[STARPU_NMAXWORKERS]}
+If the @code{use_explicit_workers_mic_gpuid} flag is set, this array
+contains the logical identifiers of the MIC devices to be used.
+
+@item @code{unsigned use_explicit_workers_scc_gpuid} (default = 0)
+If this flag is set, the SCC workers will be attached to the SCC devices
+specified in the @code{workers_scc_gpuid} array. Otherwise, StarPU assigns
+the SCC devices in a round-robin fashion. This can also be specified with
+the @code{STARPU_WORKERS_SCCID} environment variable.
+
+@item @code{unsigned workers_scc_gpuid[STARPU_NMAXWORKERS]}
+If the @code{use_explicit_workers_scc_gpuid} flag is set, this array
+contains the logical identifiers of the SCC devices to be used.
+
 @item @code{int calibrate} (default = 0)
 @item @code{int calibrate} (default = 0)
 If this flag is set, StarPU will calibrate the performance models when
 If this flag is set, StarPU will calibrate the performance models when
 executing tasks. If this value is equal to @code{-1}, the default value is
 executing tasks. If this value is equal to @code{-1}, the default value is
@@ -162,6 +192,11 @@ task scheduler will however still however still try varying combined worker
 sizes to look for the most efficient ones.
 sizes to look for the most efficient ones.
 This can also be specified with the @code{STARPU_SINGLE_COMBINED_WORKER} environment variable.
 This can also be specified with the @code{STARPU_SINGLE_COMBINED_WORKER} environment variable.
 
 
+@item @code{mic_sink_program_path} (default = NULL)
+Path to the program to execute on the MIC device, compiled for MIC
+architecture. When set to NULL, StarPU automatically looks next to the host
+program location.
+
 @item @code{int disable_asynchronous_copy} (default = 0)
 @item @code{int disable_asynchronous_copy} (default = 0)
 This flag should be set to 1 to disable asynchronous copies between
 This flag should be set to 1 to disable asynchronous copies between
 CPUs and all accelerators. This can also be specified with the
 CPUs and all accelerators. This can also be specified with the
@@ -223,6 +258,12 @@ Upon successful completion, this function returns 0. Otherwise, @code{-ENODEV}
 indicates that no worker was available (so that StarPU was not initialized).
 indicates that no worker was available (so that StarPU was not initialized).
 @end deftypefun
 @end deftypefun
 
 
+@deftypefun int starpu_initialize ({struct starpu_conf *}@var{conf}, int @var{argc}, {char ***}@var{argv})
+This is the same as @code{starpu_init}, but also takes the @code{argc} and
+@code{argv} as received by the application. This is needed for MIC and SCC
+execution so that instances of StarPU can know whether they are slaves or masters.
+@end deftypefun
+
 @deftypefun int starpu_conf_init ({struct starpu_conf *}@var{conf})
 @deftypefun int starpu_conf_init ({struct starpu_conf *}@var{conf})
 This function initializes the @var{conf} structure passed as argument
 This function initializes the @var{conf} structure passed as argument
 with the default values. In case some configuration parameters are already
 with the default values. In case some configuration parameters are already
@@ -319,6 +360,8 @@ The different values are:
 @item @code{STARPU_CPU_WORKER}
 @item @code{STARPU_CPU_WORKER}
 @item @code{STARPU_CUDA_WORKER}
 @item @code{STARPU_CUDA_WORKER}
 @item @code{STARPU_OPENCL_WORKER}
 @item @code{STARPU_OPENCL_WORKER}
+@item @code{STARPU_MIC_WORKER}
+@item @code{STARPU_SCC_WORKER}
 @end table
 @end table
 @end deftp
 @end deftp
 
 
@@ -348,6 +391,20 @@ This function returns the number of OpenCL devices controlled by StarPU. The ret
 value should be at most @code{STARPU_MAXOPENCLDEVS}.
 value should be at most @code{STARPU_MAXOPENCLDEVS}.
 @end deftypefun
 @end deftypefun
 
 
+@deftypefun unsigned starpu_mic_worker_get_count (void)
+This function returns the number of MIC workers controlled by StarPU.
+@end deftypefun
+
+@deftypefun unsigned starpu_mic_device_get_count (void)
+This function returns the number of MIC devices controlled by StarPU. The returned
+value should be at most @code{STARPU_MAXMICDEVS}.
+@end deftypefun
+
+@deftypefun unsigned starpu_scc_worker_get_count (void)
+This function returns the number of SCC devices controlled by StarPU. The returned
+value should be at most @code{STARPU_MAXSCCDEVS}.
+@end deftypefun
+
 @deftypefun int starpu_worker_get_id (void)
 @deftypefun int starpu_worker_get_id (void)
 This function returns the identifier of the current worker, i.e the one associated to the calling
 This function returns the identifier of the current worker, i.e the one associated to the calling
 thread. The returned value is either -1 if the current context is not a StarPU
 thread. The returned value is either -1 if the current context is not a StarPU
@@ -420,6 +477,9 @@ todo
 @item @code{STARPU_CPU_RAM}
 @item @code{STARPU_CPU_RAM}
 @item @code{STARPU_CUDA_RAM}
 @item @code{STARPU_CUDA_RAM}
 @item @code{STARPU_OPENCL_RAM}
 @item @code{STARPU_OPENCL_RAM}
+@item @code{STARPU_MIC_RAM}
+@item @code{STARPU_SCC_RAM}
+@item @code{STARPU_SCC_SHM}
 @end table
 @end table
 @end deftp
 @end deftp
 
 
@@ -1364,8 +1424,8 @@ be useful to provide more specific method in case of e.g. available particular
 CUDA or OpenCL support.
 CUDA or OpenCL support.
 
 
 @table @asis
 @table @asis
-@item @code{int (*@{ram,cuda,opencl@}_to_@{ram,cuda,opencl@})(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
-These 12 functions define how to copy data from the @var{src_interface}
+@item @code{int (*@{ram,cuda,opencl,mic@}_to_@{ram,cuda,opencl,mic@})(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
+These 14 functions define how to copy data from the @var{src_interface}
 interface on the @var{src_node} node to the @var{dst_interface} interface
 interface on the @var{src_node} node to the @var{dst_interface} interface
 on the @var{dst_node} node. They return 0 on success.
 on the @var{dst_node} node. They return 0 on success.
 
 
@@ -1386,6 +1446,22 @@ Must return 0 if the transfer was actually completed completely synchronously,
 or -EAGAIN if at least some transfers are still ongoing and should be awaited
 or -EAGAIN if at least some transfers are still ongoing and should be awaited
 for by the core.
 for by the core.
 
 
+@item @code{int (*@{ram,mic@}_to_@{ram,mic@}_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
+These 2 functions (@code{ram_to_ram} and @code{mic_to_mic} are not among them) define how to copy
+data from the @var{src_interface} interface on the @var{src_node} node to the
+@var{dst_interface} interface on the @var{dst_node} node.
+Must return 0 if the transfer was actually completed completely synchronously,
+or -EAGAIN if at least some transfers are still ongoing and should be awaited
+for by the core.
+
+@item @code{int (*@{src,sink@}_to_@{src,sink@}_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
+These 3 functions (@code{src_to_src} is not among them) define how to copy
+data from the @var{src_interface} interface on the @var{src_node} node to the
+@var{dst_interface} interface on the @var{dst_node} node.
+Must return 0 if the transfer was actually completed completely synchronously,
+or -EAGAIN if at least some transfers are still ongoing and should be awaited
+for by the core.
+
 @item @code{int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)}
 @item @code{int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)}
 Define how to copy data from the @var{src_interface} interface on the
 Define how to copy data from the @var{src_interface} interface on the
 @var{src_node} node to the @var{dst_interface} interface on the @var{dst_node}
 @var{src_node} node to the @var{dst_interface} interface on the @var{dst_node}
@@ -1729,6 +1805,24 @@ pointer to a codelet which converts from CPU to CUDA
 
 
 @item @code{struct starpu_codelet *cuda_to_cpu_cl}
 @item @code{struct starpu_codelet *cuda_to_cpu_cl}
 pointer to a codelet which converts from CUDA to CPU
 pointer to a codelet which converts from CUDA to CPU
+
+@item @code{size_t mic_elemsize}
+the size of each element on MIC devices,
+
+@item @code{struct starpu_codelet *cpu_to_mic_cl}
+pointer to a codelet which converts from CPU to MIC
+
+@item @code{struct starpu_codelet *mic_to_cpu_cl}
+pointer to a codelet which converts from MIC to CPU
+
+@item @code{size_t scc_elemsize}
+the size of each element on SCC devices,
+
+@item @code{struct starpu_codelet *cpu_to_scc_cl}
+pointer to a codelet which converts from CPU to SCC
+
+@item @code{struct starpu_codelet *scc_to_cpu_cl}
+pointer to a codelet which converts from SCC to CPU
 @end table
 @end table
 @end deftp
 @end deftp
 
 
@@ -1791,7 +1885,19 @@ processing unit.
 
 
 @defmac STARPU_OPENCL
 @defmac STARPU_OPENCL
 This macro is used when setting the field @code{where} of a @code{struct
 This macro is used when setting the field @code{where} of a @code{struct
-starpu_codelet} to specify the codelet may be executed on a OpenCL
+starpu_codelet} to specify the codelet may be executed on an OpenCL
+processing unit.
+@end defmac
+
+@defmac STARPU_MIC
+This macro is used when setting the field @code{where} of a @code{struct
+starpu_codelet} to specify the codelet may be executed on a MIC
+processing unit.
+@end defmac
+
+@defmac STARPU_SCC
+This macro is used when setting the field @code{where} of a @code{struct
+starpu_codelet} to specify the codelet may be executed on an SCC
 processing unit.
 processing unit.
 @end defmac
 @end defmac
 
 
@@ -1864,6 +1970,12 @@ If the @code{where} field is set, then the @code{cpu_funcs} field is
 ignored if @code{STARPU_CPU} does not appear in the @code{where}
 ignored if @code{STARPU_CPU} does not appear in the @code{where}
 field, it must be non-null otherwise.
 field, it must be non-null otherwise.
 
 
+@item @code{char * cpu_funcs_name[STARPU_MAXIMPLEMENTATIONS]} (optional)
+Is an array of strings which provide the name of the CPU functions referenced in
+the @code{cpu_funcs} array. This can be used when running on MIC devices or the
+SCC platform, for StarPU to simply look up the MIC function implementation
+through its name.
+
 @item @code{starpu_cuda_func_t cuda_func} (optional)
 @item @code{starpu_cuda_func_t cuda_func} (optional)
 This field has been made deprecated. One should use instead the
 This field has been made deprecated. One should use instead the
 @code{cuda_funcs} field.
 @code{cuda_funcs} field.
@@ -1891,6 +2003,28 @@ If the @code{where} field is set, then the @code{opencl_funcs} field
 is ignored if @code{STARPU_OPENCL} does not appear in the @code{where}
 is ignored if @code{STARPU_OPENCL} does not appear in the @code{where}
 field, it must be non-null otherwise.
 field, it must be non-null otherwise.
 
 
+@item @code{starpu_mic_func_t mic_funcs[STARPU_MAXIMPLEMENTATIONS]} (optional)
+Is an array of function pointers to a function which returns the MIC
+implementation of the codelet.
+It must be terminated by a NULL value.
+The functions prototype must be:
+@code{starpu_mic_kernel_t mic_func(struct starpu_codelet *cl, unsigned nimpl);}.
+If the @code{where} field is set, then the @code{mic_funcs} field
+is ignored if @code{STARPU_MIC} does not appear in the @code{where}
+field. It can be null if @code{cpu_funcs_name} is non-NULL, in which case StarPU
+will simply make a symbol lookup to get the implementation.
+
+@item @code{starpu_scc_func_t scc_funcs[STARPU_MAXIMPLEMENTATIONS]} (optional)
+Is an array of function pointers to a function which returns the SCC
+implementation of the codelet.
+It must be terminated by a NULL value.
+The functions prototype must be:
+@code{starpu_scc_kernel_t scc_func(struct starpu_codelet *cl, unsigned nimpl);}.
+If the @code{where} field is set, then the @code{scc_funcs} field
+is ignored if @code{STARPU_SCC} does not appear in the @code{where}
+field. It can be null if @code{cpu_funcs_name} is non-NULL, in which case StarPU
+will simply make a symbol lookup to get the implementation.
+
 @item @code{unsigned nbuffers}
 @item @code{unsigned nbuffers}
 Specifies the number of arguments taken by the codelet. These arguments are
 Specifies the number of arguments taken by the codelet. These arguments are
 managed by the DSM and are accessed from the @code{void *buffers[]}
 managed by the DSM and are accessed from the @code{void *buffers[]}
@@ -2287,6 +2421,7 @@ executing. It thus does not include tasks waiting for dependencies.
 This function returns the task currently executed by the worker, or
 This function returns the task currently executed by the worker, or
 NULL if it is called either from a thread that is not a task or simply
 NULL if it is called either from a thread that is not a task or simply
 because there is no task being executed at the moment.
 because there is no task being executed at the moment.
+This function must be called from the callback (not from the codelet).
 @end deftypefun
 @end deftypefun
 
 
 @deftypefun void starpu_codelet_display_stats ({struct starpu_codelet} *@var{cl})
 @deftypefun void starpu_codelet_display_stats ({struct starpu_codelet} *@var{cl})
@@ -3263,6 +3398,48 @@ successfull. It returns 0 if the synchronous copy was successful, or
 fails otherwise.
 fails otherwise.
 @end deftypefun
 @end deftypefun
 
 
+@node MIC extensions
+@section MIC extensions
+
+@defmac STARPU_USE_MIC
+This macro is defined when StarPU has been installed with MIC
+support. It should be used in your code to detect the availability of
+MIC.
+@end defmac
+
+@deftypefun int starpu_mic_register_kernel({starpu_mic_func_symbol_t *}@var{symbol}, {const char *}@var{func_name})
+Initiate a lookup on each MIC device to find the address of the function
+named FUNC_NAME, store them in the global array kernels and return
+the index in the array through SYMBOL.
+@end deftypefun
+
+@deftypefun starpu_mic_kernel_t starpu_mic_get_kernel(starpu_mic_func_symbol_t @var{symbol})
+On success, return the pointer to the function defined by SYMBOL on the
+device linked to the called device. This can for instance be used in a
+@code{starpu_mic_func_t} implementation.
+@end deftypefun
+
+@node SCC extensions
+@section SCC extensions
+
+@defmac STARPU_USE_SCC
+This macro is defined when StarPU has been installed with SCC
+support. It should be used in your code to detect the availability of
+SCC.
+@end defmac
+
+@deftypefun int starpu_scc_register_kernel({starpu_scc_func_symbol_t *}@var{symbol}, {const char *}@var{func_name})
+Initiate a lookup on each SCC device to find the address of the function
+named FUNC_NAME, store them in the global array kernels and return
+the index in the array through SYMBOL.
+@end deftypefun
+
+@deftypefun starpu_scc_kernel_t starpu_scc_get_kernel(starpu_scc_func_symbol_t @var{symbol})
+On success, return the pointer to the function defined by SYMBOL on the
+device linked to the called device. This can for instance be used in a
+@code{starpu_scc_func_t} implementation.
+@end deftypefun
+
 @node Miscellaneous helpers
 @node Miscellaneous helpers
 @section Miscellaneous helpers
 @section Miscellaneous helpers
 
 
@@ -3816,7 +3993,11 @@ number of CUDA workers.
 
 
 @item @code{unsigned nhwopenclgpus}
 @item @code{unsigned nhwopenclgpus}
 Total number of OpenCL devices, as detected. May be different from the actual
 Total number of OpenCL devices, as detected. May be different from the actual
-number of CUDA workers.
+number of OpenCL workers.
+
+@item @code{unsigned nhscc}
+Total number of SCC cores, as detected. May be different from the actual
+number of SCC workers.
 
 
 @item @code{unsigned ncpus}
 @item @code{unsigned ncpus}
 Actual number of CPU workers used by StarPU.
 Actual number of CPU workers used by StarPU.
@@ -3827,6 +4008,9 @@ Actual number of CUDA workers used by StarPU.
 @item @code{unsigned nopenclgpus}
 @item @code{unsigned nopenclgpus}
 Actual number of OpenCL workers used by StarPU.
 Actual number of OpenCL workers used by StarPU.
 
 
+@item @code{unsigned nsccdevices}
+Actual number of SCC workers used by StarPU.
+
 @item @code{unsigned workers_bindid[STARPU_NMAXWORKERS]}
 @item @code{unsigned workers_bindid[STARPU_NMAXWORKERS]}
 Indicates the successive cpu identifier that should be used to bind the
 Indicates the successive cpu identifier that should be used to bind the
 workers. It is either filled according to the user's explicit
 workers. It is either filled according to the user's explicit
@@ -3835,17 +4019,29 @@ variable. Otherwise, a round-robin policy is used to distributed the workers
 over the cpus.
 over the cpus.
 
 
 @item @code{unsigned workers_cuda_gpuid[STARPU_NMAXWORKERS]}
 @item @code{unsigned workers_cuda_gpuid[STARPU_NMAXWORKERS]}
-Indicates the successive cpu identifier that should be used by the CUDA
+Indicates the successive CUDA identifier that should be used by the CUDA
 driver.  It is either filled according to the user's explicit parameters (from
 driver.  It is either filled according to the user's explicit parameters (from
 starpu_conf) or according to the STARPU_WORKERS_CUDAID env. variable. Otherwise,
 starpu_conf) or according to the STARPU_WORKERS_CUDAID env. variable. Otherwise,
 they are taken in ID order.
 they are taken in ID order.
 
 
 @item @code{unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS]}
 @item @code{unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS]}
-Indicates the successive cpu identifier that should be used by the OpenCL
+Indicates the successive OpenCL identifier that should be used by the OpenCL
 driver.  It is either filled according to the user's explicit parameters (from
 driver.  It is either filled according to the user's explicit parameters (from
 starpu_conf) or according to the STARPU_WORKERS_OPENCLID env. variable. Otherwise,
 starpu_conf) or according to the STARPU_WORKERS_OPENCLID env. variable. Otherwise,
 they are taken in ID order.
 they are taken in ID order.
 
 
+@item @code{unsigned workers_mic_deviceid[STARPU_NMAXWORKERS]}
+Indicates the successive MIC devices that should be used by the MIC
+driver.  It is either filled according to the user's explicit parameters (from
+starpu_conf) or according to the STARPU_WORKERS_MICID env. variable. Otherwise,
+they are taken in ID order.
+
+@item @code{unsigned workers_scc_deviceid[STARPU_NMAXWORKERS]}
+Indicates the successive SCC devices that should be used by the SCC
+driver.  It is either filled according to the user's explicit parameters (from
+starpu_conf) or according to the STARPU_WORKERS_SCCID env. variable. Otherwise,
+they are taken in ID order.
+
 @end table
 @end table
 @end deftp
 @end deftp
 
 

+ 3 - 0
doc/chapters/basic-examples.texi

@@ -132,6 +132,7 @@ struct starpu_codelet cl =
 @{
 @{
     .where = STARPU_CPU,
     .where = STARPU_CPU,
     .cpu_funcs = @{ cpu_func, NULL @},
     .cpu_funcs = @{ cpu_func, NULL @},
+    .cpu_funcs_name = @{ "cpu_func", NULL @},
     .nbuffers = 0
     .nbuffers = 0
 @};
 @};
 @end smallexample
 @end smallexample
@@ -642,6 +643,7 @@ void scal_cpu_func(void *buffers[], void *cl_arg)
 struct starpu_codelet cl =
 struct starpu_codelet cl =
 @{
 @{
     .cpu_funcs = @{ scal_cpu_func, NULL @},
     .cpu_funcs = @{ scal_cpu_func, NULL @},
+    .cpu_funcs_name = @{ "scal_cpu_func", NULL @},
     .nbuffers = 1,
     .nbuffers = 1,
     .modes = @{ STARPU_RW @}
     .modes = @{ STARPU_RW @}
 @};
 @};
@@ -822,6 +824,7 @@ static struct starpu_codelet cl =
 @{
 @{
     .cuda_funcs = @{ scal_cuda_func, NULL @},
     .cuda_funcs = @{ scal_cuda_func, NULL @},
     .cpu_funcs = @{ scal_cpu_func, NULL @},
     .cpu_funcs = @{ scal_cpu_func, NULL @},
+    .cpu_funcs_name = @{ "scal_cpu_func", NULL @},
     .opencl_funcs = @{ scal_opencl_func, NULL @},
     .opencl_funcs = @{ scal_opencl_func, NULL @},
     .nbuffers = 1,
     .nbuffers = 1,
     .modes = @{ STARPU_RW @}
     .modes = @{ STARPU_RW @}

+ 1 - 0
doc/chapters/perf-optimization.texi

@@ -554,6 +554,7 @@ CUDA or OpenCL execution:
 static struct starpu_codelet cl11 =
 static struct starpu_codelet cl11 =
 @{
 @{
 	.cpu_funcs = @{chol_cpu_codelet_update_u11, NULL@},
 	.cpu_funcs = @{chol_cpu_codelet_update_u11, NULL@},
+	.cpu_funcs_name = @{"chol_cpu_codelet_update_u11", NULL@},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = @{chol_cublas_codelet_update_u11, NULL@},
 	.cuda_funcs = @{chol_cublas_codelet_update_u11, NULL@},
 #elif defined(STARPU_SIMGRID)
 #elif defined(STARPU_SIMGRID)

+ 2 - 1
doc/chapters/vector_scal_c.texi

@@ -1,7 +1,7 @@
 @c -*-texinfo-*-
 @c -*-texinfo-*-
 
 
 @c This file is part of the StarPU Handbook.
 @c This file is part of the StarPU Handbook.
-@c Copyright (C) 2009-2011  Université de Bordeaux 1
+@c Copyright (C) 2009-2011, 2013  Université de Bordeaux 1
 @c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 @c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 @c See the file starpu.texi for copying conditions.
 @c See the file starpu.texi for copying conditions.
 
 
@@ -26,6 +26,7 @@ static struct starpu_codelet cl = @{
     .where = STARPU_CPU | STARPU_CUDA | STARPU_OPENCL,
     .where = STARPU_CPU | STARPU_CUDA | STARPU_OPENCL,
     /* CPU implementation of the codelet */
     /* CPU implementation of the codelet */
     .cpu_funcs = @{ scal_cpu_func, scal_sse_func, NULL @},
     .cpu_funcs = @{ scal_cpu_func, scal_sse_func, NULL @},
+    .cpu_funcs_name = @{ "scal_cpu_func", "scal_sse_func", NULL @},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
     /* CUDA implementation of the codelet */
     /* CUDA implementation of the codelet */
     .cuda_funcs = @{ scal_cuda_func, NULL @},
     .cuda_funcs = @{ scal_cuda_func, NULL @},

+ 1 - 1
examples/Makefile.am

@@ -20,7 +20,7 @@ AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STAR
 AM_CXXFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
 AM_CXXFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) 
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS)
 
 
 SUBDIRS = stencil
 SUBDIRS = stencil
 
 

+ 15 - 2
examples/basic_examples/vector_scal.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -52,7 +52,7 @@ static struct starpu_perfmodel vector_scal_power_model =
 
 
 static struct starpu_codelet cl =
 static struct starpu_codelet cl =
 {
 {
-	.where = STARPU_CPU | STARPU_CUDA | STARPU_OPENCL,
+	.where = STARPU_CPU | STARPU_CUDA | STARPU_OPENCL | STARPU_MIC,
 	/* CPU implementation of the codelet */
 	/* CPU implementation of the codelet */
 	.cpu_funcs = {
 	.cpu_funcs = {
 		scal_cpu_func
 		scal_cpu_func
@@ -67,6 +67,19 @@ static struct starpu_codelet cl =
 #endif
 #endif
 		, NULL
 		, NULL
 	},
 	},
+	.cpu_funcs_name = {
+		"scal_cpu_func",
+#ifdef STARPU_HAVE_ICC
+		"scal_cpu_func_icc",
+#endif
+#ifdef __SSE__
+		"scal_sse_func",
+#ifdef STARPU_HAVE_ICC
+		"scal_sse_func_icc"
+#endif
+#endif
+	},
+
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	/* CUDA implementation of the codelet */
 	/* CUDA implementation of the codelet */
 	.cuda_funcs = {scal_cuda_func, NULL},
 	.cuda_funcs = {scal_cuda_func, NULL},

+ 1 - 1
examples/stencil/Makefile.am

@@ -16,7 +16,7 @@
 AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
 AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(HWLOC_LIBS) @LIBS@
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(HWLOC_LIBS) @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS)
 
 
 if USE_MPI
 if USE_MPI
 LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la

+ 22 - 0
include/starpu.h

@@ -92,6 +92,10 @@ struct starpu_conf
 	int ncuda;
 	int ncuda;
 	/* number of GPU OpenCL device workers (-1 for default) */
 	/* number of GPU OpenCL device workers (-1 for default) */
 	int nopencl;
 	int nopencl;
+	/* number of MIC device workers (-1 for default) */
+	int nmic;
+	/* number of SCC device workers (-1 for default) */
+	int nscc;
 
 
 	unsigned use_explicit_workers_bindid;
 	unsigned use_explicit_workers_bindid;
 	unsigned workers_bindid[STARPU_NMAXWORKERS];
 	unsigned workers_bindid[STARPU_NMAXWORKERS];
@@ -102,6 +106,12 @@ struct starpu_conf
 	unsigned use_explicit_workers_opencl_gpuid;
 	unsigned use_explicit_workers_opencl_gpuid;
 	unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS];
 	unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS];
 
 
+	unsigned use_explicit_workers_mic_deviceid;
+	unsigned workers_mic_deviceid[STARPU_NMAXWORKERS];
+
+	unsigned use_explicit_workers_scc_deviceid;
+	unsigned workers_scc_deviceid[STARPU_NMAXWORKERS];
+
 	/* calibrate bus (-1 for default) */
 	/* calibrate bus (-1 for default) */
 	int bus_calibrate;
 	int bus_calibrate;
 
 
@@ -111,6 +121,10 @@ struct starpu_conf
 	/* Create only one combined worker, containing all CPU workers */
 	/* Create only one combined worker, containing all CPU workers */
 	int single_combined_worker;
 	int single_combined_worker;
 
 
+	/* Path to the kernel to execute on the MIC device, compiled
+	 * for MIC architecture. */
+	char *mic_sink_program_path;
+
 	/* indicate if all asynchronous copies should be disabled */
 	/* indicate if all asynchronous copies should be disabled */
 	int disable_asynchronous_copy;
 	int disable_asynchronous_copy;
 
 
@@ -140,6 +154,12 @@ int starpu_conf_init(struct starpu_conf *conf);
  */
  */
 int starpu_init(struct starpu_conf *conf) STARPU_WARN_UNUSED_RESULT;
 int starpu_init(struct starpu_conf *conf) STARPU_WARN_UNUSED_RESULT;
 
 
+/* Alternative initialization method with argc and argv. This is used by
+ * the MIC, MPI, and SCC implementations.
+ * Don't call starpu_init and starpu_initialize in the same program.
+ */
+int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv);
+
 /* Shutdown method: note that statistics are only generated once StarPU is
 /* Shutdown method: note that statistics are only generated once StarPU is
  * shutdown */
  * shutdown */
 void starpu_shutdown(void);
 void starpu_shutdown(void);
@@ -156,6 +176,8 @@ void starpu_display_stats();
 
 
 void starpu_get_version(int *major, int *minor, int *release);
 void starpu_get_version(int *major, int *minor, int *release);
 
 
+int starpu_worker_get_mp_nodeid(int id);
+
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }
 #endif
 #endif

+ 5 - 0
include/starpu_config.h.in

@@ -25,6 +25,8 @@
 #undef STARPU_USE_CPU
 #undef STARPU_USE_CPU
 #undef STARPU_USE_CUDA
 #undef STARPU_USE_CUDA
 #undef STARPU_USE_OPENCL
 #undef STARPU_USE_OPENCL
+#undef STARPU_USE_MIC
+#undef STARPU_USE_SCC
 
 
 #undef STARPU_SIMGRID
 #undef STARPU_SIMGRID
 
 
@@ -70,9 +72,12 @@
 #undef STARPU_MAXCPUS
 #undef STARPU_MAXCPUS
 #undef STARPU_MAXCUDADEVS
 #undef STARPU_MAXCUDADEVS
 #undef STARPU_MAXOPENCLDEVS
 #undef STARPU_MAXOPENCLDEVS
+#undef STARPU_MAXMICDEVS
+#undef STARPU_MAXSCCDEVS
 #undef STARPU_NMAXWORKERS
 #undef STARPU_NMAXWORKERS
 #undef STARPU_NMAX_SCHED_CTXS
 #undef STARPU_NMAX_SCHED_CTXS
 #undef STARPU_MAXIMPLEMENTATIONS
 #undef STARPU_MAXIMPLEMENTATIONS
+#undef STARPU_MAXMPKERNELS
 #undef STARPU_USE_SC_HYPERVISOR
 #undef STARPU_USE_SC_HYPERVISOR
 #undef STARPU_HAVE_GLPK_H
 #undef STARPU_HAVE_GLPK_H
 
 

+ 8 - 1
include/starpu_data.h

@@ -102,7 +102,14 @@ enum starpu_node_kind
 	STARPU_UNUSED     = 0x00,
 	STARPU_UNUSED     = 0x00,
 	STARPU_CPU_RAM    = 0x01,
 	STARPU_CPU_RAM    = 0x01,
 	STARPU_CUDA_RAM   = 0x02,
 	STARPU_CUDA_RAM   = 0x02,
-	STARPU_OPENCL_RAM = 0x03
+	STARPU_OPENCL_RAM = 0x03,
+	STARPU_MIC_RAM    = 0x05,
+
+	/* This node kind is not used anymore, but implementations in interfaces
+	 * will be useful for MPI. */
+	STARPU_SCC_RAM    = 0x06,
+
+	STARPU_SCC_SHM    = 0x07
 };
 };
 
 
 unsigned starpu_worker_get_memory_node(unsigned workerid);
 unsigned starpu_worker_get_memory_node(unsigned workerid);

+ 57 - 2
include/starpu_data_interfaces.h

@@ -45,6 +45,7 @@ struct starpu_data_copy_methods
 	int (*ram_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*ram_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*ram_to_cuda)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*ram_to_cuda)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*ram_to_opencl)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*ram_to_opencl)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+	int (*ram_to_mic)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 
 
 	/* src type is cuda */
 	/* src type is cuda */
 	int (*cuda_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*cuda_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
@@ -56,6 +57,14 @@ struct starpu_data_copy_methods
 	int (*opencl_to_cuda)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*opencl_to_cuda)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*opencl_to_opencl)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*opencl_to_opencl)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 
 
+	/* src type is mic */
+	int (*mic_to_ram)(void *src_interface, unsigned srd_node, void *dst_interface, unsigned dst_node);
+
+	/* scc case */
+	int (*scc_src_to_sink)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+	int (*scc_sink_to_src)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+	int (*scc_sink_to_sink)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	/* for asynchronous CUDA transfers */
 	/* for asynchronous CUDA transfers */
 	int (*ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
 	int (*ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
@@ -74,6 +83,12 @@ struct starpu_data_copy_methods
 	int (*opencl_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event);
 	int (*opencl_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event);
 #endif
 #endif
 
 
+#ifdef STARPU_USE_MIC
+	/* Asynchronous MIC transfers */
+	int (*ram_to_mic_async)(void *src_intreface, unsigned src_node, void *dst_interface, unsigned dst_node);
+	int (*mic_to_ram_async)(void *src_interface, unsigned srd_node, void *dst_interface, unsigned dst_node);
+#endif
+
 	int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 	int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 };
 };
 
 
@@ -162,6 +177,8 @@ extern struct starpu_data_interface_ops starpu_interface_matrix_ops;
 /* Matrix interface for dense matrices */
 /* Matrix interface for dense matrices */
 struct starpu_matrix_interface
 struct starpu_matrix_interface
 {
 {
+	enum starpu_data_interface_id id;
+
 	uintptr_t ptr;
 	uintptr_t ptr;
 	uintptr_t dev_handle;
 	uintptr_t dev_handle;
 	size_t offset;
 	size_t offset;
@@ -192,6 +209,8 @@ size_t starpu_matrix_get_elemsize(starpu_data_handle_t handle);
  */
  */
 struct starpu_coo_interface
 struct starpu_coo_interface
 {
 {
+	enum starpu_data_interface_id id;
+
 	uint32_t  *columns;
 	uint32_t  *columns;
 	uint32_t  *rows;
 	uint32_t  *rows;
 	uintptr_t values;
 	uintptr_t values;
@@ -229,6 +248,8 @@ void starpu_coo_data_register(starpu_data_handle_t *handleptr, unsigned home_nod
 /* TODO: rename to 3dmatrix? */
 /* TODO: rename to 3dmatrix? */
 struct starpu_block_interface
 struct starpu_block_interface
 {
 {
+	enum starpu_data_interface_id id;
+
 	uintptr_t ptr;
 	uintptr_t ptr;
 	uintptr_t dev_handle;
 	uintptr_t dev_handle;
 	size_t offset;
 	size_t offset;
@@ -263,6 +284,8 @@ size_t starpu_block_get_elemsize(starpu_data_handle_t handle);
 /* vector interface for contiguous (non-strided) buffers */
 /* vector interface for contiguous (non-strided) buffers */
 struct starpu_vector_interface
 struct starpu_vector_interface
 {
 {
+	enum starpu_data_interface_id id;
+
 	uintptr_t ptr;
 	uintptr_t ptr;
 	uintptr_t dev_handle;
 	uintptr_t dev_handle;
 	size_t offset;
 	size_t offset;
@@ -285,9 +308,12 @@ uintptr_t starpu_vector_get_local_ptr(starpu_data_handle_t handle);
 /* variable interface for a single data (not a vector, a matrix, a list, ...) */
 /* variable interface for a single data (not a vector, a matrix, a list, ...) */
 struct starpu_variable_interface
 struct starpu_variable_interface
 {
 {
+	enum starpu_data_interface_id id;
+
 	uintptr_t ptr;
 	uintptr_t ptr;
+	uintptr_t dev_handle;
+	size_t offset;
 	size_t elemsize;
 	size_t elemsize;
-	/* No dev_handle, since it can not be filtered, offset will always be zero */
 };
 };
 
 
 void starpu_variable_data_register(starpu_data_handle_t *handle, unsigned home_node, uintptr_t ptr, size_t size);
 void starpu_variable_data_register(starpu_data_handle_t *handle, unsigned home_node, uintptr_t ptr, size_t size);
@@ -296,10 +322,10 @@ uintptr_t starpu_variable_get_local_ptr(starpu_data_handle_t handle);
 
 
 /* helper methods */
 /* helper methods */
 #define STARPU_VARIABLE_GET_PTR(interface)	(((struct starpu_variable_interface *)(interface))->ptr)
 #define STARPU_VARIABLE_GET_PTR(interface)	(((struct starpu_variable_interface *)(interface))->ptr)
+#define STARPU_VARIABLE_GET_OFFSET(interface)	(((struct starpu_variable_interface *)(interface))->offset)
 #define STARPU_VARIABLE_GET_ELEMSIZE(interface)	(((struct starpu_variable_interface *)(interface))->elemsize)
 #define STARPU_VARIABLE_GET_ELEMSIZE(interface)	(((struct starpu_variable_interface *)(interface))->elemsize)
 #define STARPU_VARIABLE_GET_DEV_HANDLE(interface) \
 #define STARPU_VARIABLE_GET_DEV_HANDLE(interface) \
 	(((struct starpu_variable_interface *)(interface))->ptr)
 	(((struct starpu_variable_interface *)(interface))->ptr)
-#define STARPU_VARIABLE_GET_OFFSET 0
 
 
 /* void interface. There is no data really associated to that interface, but it
 /* void interface. There is no data really associated to that interface, but it
  * may be used as a synchronization mechanism. It also permits to express an
  * may be used as a synchronization mechanism. It also permits to express an
@@ -311,6 +337,8 @@ void starpu_void_data_register(starpu_data_handle_t *handle);
 /* CSR interface for sparse matrices (compressed sparse row representation) */
 /* CSR interface for sparse matrices (compressed sparse row representation) */
 struct starpu_csr_interface
 struct starpu_csr_interface
 {
 {
+	enum starpu_data_interface_id id;
+
 	uint32_t nnz; /* number of non-zero entries */
 	uint32_t nnz; /* number of non-zero entries */
 	uint32_t nrow; /* number of rows */
 	uint32_t nrow; /* number of rows */
 	uintptr_t nzval; /* non-zero values */
 	uintptr_t nzval; /* non-zero values */
@@ -352,6 +380,8 @@ size_t starpu_csr_get_elemsize(starpu_data_handle_t handle);
  * representation) */
  * representation) */
 struct starpu_bcsr_interface
 struct starpu_bcsr_interface
 {
 {
+	enum starpu_data_interface_id id;
+
 	uint32_t nnz; /* number of non-zero BLOCKS */
 	uint32_t nnz; /* number of non-zero BLOCKS */
 	uint32_t nrow; /* number of rows (in terms of BLOCKS) */
 	uint32_t nrow; /* number of rows (in terms of BLOCKS) */
 
 
@@ -406,13 +436,22 @@ struct starpu_multiformat_data_interface_ops
 	size_t cuda_elemsize;
 	size_t cuda_elemsize;
 	struct starpu_codelet *cpu_to_cuda_cl;
 	struct starpu_codelet *cpu_to_cuda_cl;
 	struct starpu_codelet *cuda_to_cpu_cl;
 	struct starpu_codelet *cuda_to_cpu_cl;
+	size_t mic_elemsize;
+	struct starpu_codelet *cpu_to_mic_cl;
+	struct starpu_codelet *mic_to_cpu_cl;
+	size_t scc_elemsize;
+	struct starpu_codelet *cpu_to_scc_cl;
+	struct starpu_codelet *scc_to_cpu_cl;
 };
 };
 
 
 struct starpu_multiformat_interface
 struct starpu_multiformat_interface
 {
 {
+	enum starpu_data_interface_id id;
+
 	void *cpu_ptr;
 	void *cpu_ptr;
 	void *cuda_ptr;
 	void *cuda_ptr;
 	void *opencl_ptr;
 	void *opencl_ptr;
+	void *mic_ptr;
 	uint32_t nx;
 	uint32_t nx;
 	struct starpu_multiformat_data_interface_ops *ops;
 	struct starpu_multiformat_data_interface_ops *ops;
 };
 };
@@ -422,8 +461,24 @@ void starpu_multiformat_data_register(starpu_data_handle_t *handle, unsigned hom
 #define STARPU_MULTIFORMAT_GET_CPU_PTR(interface)  (((struct starpu_multiformat_interface *)(interface))->cpu_ptr)
 #define STARPU_MULTIFORMAT_GET_CPU_PTR(interface)  (((struct starpu_multiformat_interface *)(interface))->cpu_ptr)
 #define STARPU_MULTIFORMAT_GET_CUDA_PTR(interface) (((struct starpu_multiformat_interface *)(interface))->cuda_ptr)
 #define STARPU_MULTIFORMAT_GET_CUDA_PTR(interface) (((struct starpu_multiformat_interface *)(interface))->cuda_ptr)
 #define STARPU_MULTIFORMAT_GET_OPENCL_PTR(interface) (((struct starpu_multiformat_interface *)(interface))->opencl_ptr)
 #define STARPU_MULTIFORMAT_GET_OPENCL_PTR(interface) (((struct starpu_multiformat_interface *)(interface))->opencl_ptr)
+#define STARPU_MULTIFORMAT_GET_MIC_PTR(interface) (((struct starpu_multiformat_interface *)(interface))->mic_ptr)
 #define STARPU_MULTIFORMAT_GET_NX(interface)  (((struct starpu_multiformat_interface *)(interface))->nx)
 #define STARPU_MULTIFORMAT_GET_NX(interface)  (((struct starpu_multiformat_interface *)(interface))->nx)
 
 
+/* Generic type able to hold any one of the interface structures listed below.
+ * For now it is only used before execution on message-passing devices, but
+ * it can be useful in other cases. */
+union _starpu_interface
+{
+	struct starpu_matrix_interface matrix;
+	struct starpu_block_interface block;
+	struct starpu_vector_interface vector;
+	struct starpu_csr_interface csr;
+	struct starpu_coo_interface coo;
+	struct starpu_bcsr_interface bcsr;
+	struct starpu_variable_interface variable;
+	struct starpu_multiformat_interface multiformat;
+};
+
 enum starpu_data_interface_id starpu_data_get_interface_id(starpu_data_handle_t handle);
 enum starpu_data_interface_id starpu_data_get_interface_id(starpu_data_handle_t handle);
 
 
 int starpu_data_pack(starpu_data_handle_t handle, void **ptr, starpu_ssize_t *count);
 int starpu_data_pack(starpu_data_handle_t handle, void **ptr, starpu_ssize_t *count);

+ 35 - 0
include/starpu_mic.h

@@ -0,0 +1,35 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#ifndef __STARPU_MIC_H__
+#define __STARPU_MIC_H__
+
+#include <starpu_config.h>
+#include <starpu_task.h> /* declares starpu_mic_kernel_t, used below */
+
+#ifdef STARPU_USE_MIC
+
+typedef void *starpu_mic_func_symbol_t; /* untyped handle naming a MIC kernel */
+
+/* NOTE(review): presumably stores in *symbol a handle for the kernel named func_name; meaning of the int result not visible here -- confirm. */
+int starpu_mic_register_kernel(starpu_mic_func_symbol_t *symbol, const char *func_name);
+/* NOTE(review): presumably returns the kernel associated with a previously registered symbol -- confirm. */
+starpu_mic_kernel_t starpu_mic_get_kernel(starpu_mic_func_symbol_t symbol);
+
+#endif /* STARPU_USE_MIC */
+
+#endif /* __STARPU_MIC_H__ */

+ 10 - 4
include/starpu_perfmodel.h

@@ -43,8 +43,10 @@ enum starpu_perfmodel_archtype
 	STARPU_CPU_DEFAULT = 0,
 	STARPU_CPU_DEFAULT = 0,
 	/* CPU combined workers between 0 and STARPU_MAXCPUS-1 */
 	/* CPU combined workers between 0 and STARPU_MAXCPUS-1 */
 	STARPU_CUDA_DEFAULT = STARPU_MAXCPUS,
 	STARPU_CUDA_DEFAULT = STARPU_MAXCPUS,
-	STARPU_OPENCL_DEFAULT = STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS
+	STARPU_OPENCL_DEFAULT = STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS,
 	/* STARPU_OPENCL_DEFAULT + devid */
 	/* STARPU_OPENCL_DEFAULT + devid */
+	STARPU_MIC_DEFAULT = STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS,
+	STARPU_SCC_DEFAULT = STARPU_MIC_DEFAULT + STARPU_MAXMICDEVS
 };
 };
 
 
 #ifdef __STDC_VERSION__
 #ifdef __STDC_VERSION__
@@ -54,15 +56,19 @@ enum starpu_perfmodel_archtype
 
 
 _Static_assert(STARPU_CPU_DEFAULT == 0,
 _Static_assert(STARPU_CPU_DEFAULT == 0,
 	       "invalid STARPU_CPU_DEFAULT value");
 	       "invalid STARPU_CPU_DEFAULT value");
-_Static_assert(STARPU_CUDA_DEFAULT > STARPU_CPU_DEFAULT,
-	       "invalid STARPU_CPU_DEFAULT value");
+_Static_assert(STARPU_CPU_DEFAULT < STARPU_CUDA_DEFAULT,
+	       "invalid STARPU_{CPU,CUDA}_DEFAULT values");
 _Static_assert(STARPU_CUDA_DEFAULT < STARPU_OPENCL_DEFAULT,
 _Static_assert(STARPU_CUDA_DEFAULT < STARPU_OPENCL_DEFAULT,
 	       "invalid STARPU_{CUDA,OPENCL}_DEFAULT values");
 	       "invalid STARPU_{CUDA,OPENCL}_DEFAULT values");
+_Static_assert(STARPU_OPENCL_DEFAULT < STARPU_MIC_DEFAULT,
+	       "invalid STARPU_{OPENCL,MIC}_DEFAULT values");
+_Static_assert(STARPU_MIC_DEFAULT < STARPU_SCC_DEFAULT,
+	       "invalid STARPU_{MIC,SCC}_DEFAULT values");
 
 
 #  endif
 #  endif
 #endif
 #endif
 
 
-#define STARPU_NARCH_VARIATIONS	(STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS)
+#define STARPU_NARCH_VARIATIONS	(STARPU_MIC_DEFAULT + STARPU_MAXMICDEVS)
 
 
 struct starpu_perfmodel_history_entry
 struct starpu_perfmodel_history_entry
 {
 {

+ 35 - 0
include/starpu_scc.h

@@ -0,0 +1,35 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#ifndef __STARPU_SCC_H__
+#define __STARPU_SCC_H__
+
+#include <starpu_config.h>
+#include <starpu_task.h> /* declares starpu_scc_kernel_t, used below */
+
+#ifdef STARPU_USE_SCC
+
+typedef void *starpu_scc_func_symbol_t; /* untyped handle naming an SCC kernel */
+
+/* NOTE(review): presumably stores in *symbol a handle for the kernel named func_name; meaning of the int result not visible here -- confirm. */
+int starpu_scc_register_kernel(starpu_scc_func_symbol_t *symbol, const char *func_name);
+/* NOTE(review): presumably returns the kernel associated with a previously registered symbol -- confirm. */
+starpu_scc_kernel_t starpu_scc_get_kernel(starpu_scc_func_symbol_t symbol);
+
+#endif /* STARPU_USE_SCC */
+
+#endif /* __STARPU_SCC_H__ */

+ 23 - 0
include/starpu_sink.h

@@ -0,0 +1,23 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#ifndef __STARPU_SINK_H__
+#define __STARPU_SINK_H__
+/* NOTE(review): presumably the sink-side worker entry point, given the program's argc/argv -- confirm against drivers/mp_common/sink_common.c. */
+void starpu_sink_common_worker(int argc, char **argv);
+
+#endif /* __STARPU_SINK_H__ */

+ 13 - 0
include/starpu_task.h

@@ -37,6 +37,8 @@ extern "C"
 #define STARPU_CPU	((1ULL)<<1)
 #define STARPU_CPU	((1ULL)<<1)
 #define STARPU_CUDA	((1ULL)<<3)
 #define STARPU_CUDA	((1ULL)<<3)
 #define STARPU_OPENCL	((1ULL)<<6)
 #define STARPU_OPENCL	((1ULL)<<6)
+#define STARPU_MIC	((1ULL)<<7)
+#define STARPU_SCC	((1ULL)<<8)
 
 
 /* Codelet types */
 /* Codelet types */
 enum starpu_codelet_type
 enum starpu_codelet_type
@@ -65,6 +67,11 @@ typedef uint64_t starpu_tag_t;
 typedef void (*starpu_cpu_func_t)(void **, void*);    /* CPU core */
 typedef void (*starpu_cpu_func_t)(void **, void*);    /* CPU core */
 typedef void (*starpu_cuda_func_t)(void **, void*);   /* NVIDIA CUDA device */
 typedef void (*starpu_cuda_func_t)(void **, void*);   /* NVIDIA CUDA device */
 typedef void (*starpu_opencl_func_t)(void **, void*); /* OpenCL CUDA device */
 typedef void (*starpu_opencl_func_t)(void **, void*); /* OpenCL CUDA device */
+typedef void (*starpu_mic_kernel_t)(void **, void*); /* MIC device */
+typedef void (*starpu_scc_kernel_t)(void **, void*); /* SCC device */
+
+typedef starpu_mic_kernel_t (*starpu_mic_func_t)(void);
+typedef starpu_scc_kernel_t (*starpu_scc_func_t)(void);
 
 
 #define STARPU_MULTIPLE_CPU_IMPLEMENTATIONS    ((starpu_cpu_func_t) -1)
 #define STARPU_MULTIPLE_CPU_IMPLEMENTATIONS    ((starpu_cpu_func_t) -1)
 #define STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS   ((starpu_cuda_func_t) -1)
 #define STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS   ((starpu_cuda_func_t) -1)
@@ -91,6 +98,10 @@ struct starpu_codelet
 	starpu_cpu_func_t cpu_funcs[STARPU_MAXIMPLEMENTATIONS];
 	starpu_cpu_func_t cpu_funcs[STARPU_MAXIMPLEMENTATIONS];
 	starpu_cuda_func_t cuda_funcs[STARPU_MAXIMPLEMENTATIONS];
 	starpu_cuda_func_t cuda_funcs[STARPU_MAXIMPLEMENTATIONS];
 	starpu_opencl_func_t opencl_funcs[STARPU_MAXIMPLEMENTATIONS];
 	starpu_opencl_func_t opencl_funcs[STARPU_MAXIMPLEMENTATIONS];
+	starpu_mic_func_t mic_funcs[STARPU_MAXIMPLEMENTATIONS];
+	starpu_scc_func_t scc_funcs[STARPU_MAXIMPLEMENTATIONS];
+
+	char *cpu_funcs_name[STARPU_MAXIMPLEMENTATIONS];
 
 
 	/* how many buffers do the codelet takes as argument ? */
 	/* how many buffers do the codelet takes as argument ? */
 	unsigned nbuffers;
 	unsigned nbuffers;
@@ -128,6 +139,8 @@ struct starpu_task
 	void *cl_arg;
 	void *cl_arg;
 	/* in case the argument buffer has to be uploaded explicitely */
 	/* in case the argument buffer has to be uploaded explicitely */
 	size_t cl_arg_size;
 	size_t cl_arg_size;
+	/* whether StarPU must release cl_arg (0 by default) */
+	unsigned cl_arg_free;
 
 
 	/* when the task is done, callback_func(callback_arg) is called */
 	/* when the task is done, callback_func(callback_arg) is called */
 	void (*callback_func)(void *);
 	void (*callback_func)(void *);

+ 33 - 1
include/starpu_worker.h

@@ -36,7 +36,17 @@ enum starpu_worker_archtype
 	STARPU_ANY_WORKER,    /* any worker, used in the hypervisor */
 	STARPU_ANY_WORKER,    /* any worker, used in the hypervisor */
 	STARPU_CPU_WORKER,    /* CPU core */
 	STARPU_CPU_WORKER,    /* CPU core */
 	STARPU_CUDA_WORKER,   /* NVIDIA CUDA device */
 	STARPU_CUDA_WORKER,   /* NVIDIA CUDA device */
-	STARPU_OPENCL_WORKER  /* OpenCL device */
+	STARPU_OPENCL_WORKER, /* OpenCL device */
+	STARPU_MIC_WORKER,    /* Intel MIC device */
+	STARPU_SCC_WORKER     /* Intel SCC device */
+};
+
+/* Represents the topology of sink devices; contains useful information about
+ * their capabilities. */
+// XXX: unused.
+struct starpu_sink_topology
+{
+	unsigned nb_cpus;
 };
 };
 
 
 struct starpu_sched_ctx_iterator
 struct starpu_sched_ctx_iterator
@@ -61,10 +71,20 @@ struct starpu_machine_topology
 	unsigned nhwcpus;
 	unsigned nhwcpus;
 	unsigned nhwcudagpus;
 	unsigned nhwcudagpus;
 	unsigned nhwopenclgpus;
 	unsigned nhwopenclgpus;
+	unsigned nhwscc;
 
 
 	unsigned ncpus;
 	unsigned ncpus;
 	unsigned ncudagpus;
 	unsigned ncudagpus;
 	unsigned nopenclgpus;
 	unsigned nopenclgpus;
+	unsigned nsccdevices;
+
+	/* Topology of MP nodes (mainly MIC and SCC) as well as necessary
+	 * objects to communicate with them. */
+	unsigned nhwmicdevices;
+	unsigned nmicdevices;
+
+	unsigned nhwmiccores[STARPU_MAXMICDEVS]; // Each MIC node has its set of cores.
+	unsigned nmiccores[STARPU_MAXMICDEVS];
 
 
 	/* Where to bind workers ? */
 	/* Where to bind workers ? */
 	unsigned workers_bindid[STARPU_NMAXWORKERS];
 	unsigned workers_bindid[STARPU_NMAXWORKERS];
@@ -74,6 +94,12 @@ struct starpu_machine_topology
 
 
 	/* Which GPU(s) do we use for OpenCL ? */
 	/* Which GPU(s) do we use for OpenCL ? */
 	unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS];
 	unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS];
+
+	/* Which MIC core(s) do we use ? */
+	/* unsigned workers_mic_deviceid[STARPU_NMAXWORKERS]; */
+
+	/* Which SCC(s) do we use ? */
+	unsigned workers_scc_deviceid[STARPU_NMAXWORKERS];
 };
 };
 
 
 /* generic structure used by the scheduling contexts to iterate the workers */
 /* generic structure used by the scheduling contexts to iterate the workers */
@@ -113,6 +139,10 @@ unsigned starpu_worker_is_combined_worker(int id);
 unsigned starpu_cpu_worker_get_count(void);
 unsigned starpu_cpu_worker_get_count(void);
 unsigned starpu_cuda_worker_get_count(void);
 unsigned starpu_cuda_worker_get_count(void);
 unsigned starpu_opencl_worker_get_count(void);
 unsigned starpu_opencl_worker_get_count(void);
+unsigned starpu_mic_worker_get_count(void);
+unsigned starpu_scc_worker_get_count(void);
+
+unsigned starpu_mic_device_get_count(void);
 
 
 /* Return the identifier of the thread in case this is associated to a worker.
 /* Return the identifier of the thread in case this is associated to a worker.
  * This will return -1 if this function is called directly from the application
  * This will return -1 if this function is called directly from the application
@@ -166,6 +196,8 @@ void starpu_worker_get_name(int id, char *dst, size_t maxlen);
  */
  */
 int starpu_worker_get_devid(int id);
 int starpu_worker_get_devid(int id);
 
 
+int starpu_worker_get_mp_nodeid(int id);
+
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }
 #endif
 #endif

+ 28 - 0
libstarpu-mic.pc.in

@@ -0,0 +1,28 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011, 2013  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpu
+Description: offers support for heterogeneous multicore architecture
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ -DSTARPU_USE_DEPRECATED_API
+Libs: @STARPU_EXPORT_DYNAMIC@ -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@ @STARPU_SC_HYPERVISOR@
+Libs.private: @LDFLAGS@ @LIBS@
+Requires: @HWLOC_REQUIRES@

+ 1 - 1
libstarpu.pc.in

@@ -23,6 +23,6 @@ Name: starpu
 Description: offers support for heterogeneous multicore architecture
 Description: offers support for heterogeneous multicore architecture
 Version: @PACKAGE_VERSION@
 Version: @PACKAGE_VERSION@
 Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ -DSTARPU_USE_DEPRECATED_API
 Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ -DSTARPU_USE_DEPRECATED_API
-Libs: -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@ @STARPU_SC_HYPERVISOR@
+Libs: @STARPU_EXPORT_DYNAMIC@ -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@ @STARPU_SC_HYPERVISOR@
 Libs.private: @LDFLAGS@ @LIBS@
 Libs.private: @LDFLAGS@ @LIBS@
 Requires: @HWLOC_REQUIRES@
 Requires: @HWLOC_REQUIRES@

+ 47 - 3
src/Makefile.am

@@ -49,10 +49,10 @@ endif STARPU_HAVE_WINDOWS
 
 
 lib_LTLIBRARIES = libstarpu-@STARPU_EFFECTIVE_VERSION@.la
 lib_LTLIBRARIES = libstarpu-@STARPU_EFFECTIVE_VERSION@.la
 
 
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CPPFLAGS = -I$(top_srcdir)/include/ -DBUILDING_STARPU
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CPPFLAGS = -I$(top_srcdir)/include/ $(STARPU_RCCE_CPPFLAGS) -DBUILDING_STARPU
 
 
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CFLAGS = $(GLOBAL_AM_CFLAGS) $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS)
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = -lm $(HWLOC_LIBS) $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LIBS) $(STARPU_GLPK_LDFLAGS)
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CFLAGS = $(GLOBAL_AM_CFLAGS) $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(STARPU_COI_CPPFLAGS) $(STARPU_RCCE_CFLAGS) $(FXT_CFLAGS)
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = -lm $(HWLOC_LIBS) $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_RCCE_LDFLAGS) $(FXT_LIBS) $(STARPU_GLPK_LDFLAGS)
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) $(FXT_LDFLAGS) -no-undefined									\
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) $(FXT_LDFLAGS) -no-undefined									\
   -version-info $(libstarpu_so_version)
   -version-info $(libstarpu_so_version)
 
 
@@ -105,11 +105,20 @@ noinst_HEADERS = 						\
 	common/uthash.h						\
 	common/uthash.h						\
 	common/barrier_counter.h				\
 	common/barrier_counter.h				\
 	drivers/driver_common/driver_common.h			\
 	drivers/driver_common/driver_common.h			\
+	drivers/mp_common/mp_common.h				\
+	drivers/mp_common/source_common.h			\
+	drivers/mp_common/sink_common.h				\
 	drivers/cpu/driver_cpu.h				\
 	drivers/cpu/driver_cpu.h				\
 	drivers/cuda/driver_cuda.h				\
 	drivers/cuda/driver_cuda.h				\
 	drivers/opencl/driver_opencl.h				\
 	drivers/opencl/driver_opencl.h				\
 	drivers/opencl/driver_opencl_utils.h			\
 	drivers/opencl/driver_opencl_utils.h			\
 	debug/starpu_debug_helpers.h				\
 	debug/starpu_debug_helpers.h				\
+	drivers/mic/driver_mic_common.h				\
+	drivers/mic/driver_mic_source.h				\
+	drivers/mic/driver_mic_sink.h				\
+	drivers/scc/driver_scc_common.h				\
+	drivers/scc/driver_scc_source.h				\
+	drivers/scc/driver_scc_sink.h				\
 	debug/traces/starpu_fxt.h				\
 	debug/traces/starpu_fxt.h				\
 	profiling/bound.h					\
 	profiling/bound.h					\
 	profiling/profiling.h					\
 	profiling/profiling.h					\
@@ -244,5 +253,40 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/opencl/driver_opencl.
 endif
 endif
 endif
 endif
 
 
+if STARPU_USE_SCC
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/scc/driver_scc_common.c
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/scc/driver_scc_source.c
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/scc/driver_scc_sink.c
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/scc/driver_scc_utils.c
+endif
+
+
+#########################################
+#										#
+#        Generic MP compilation			#
+#										#
+#########################################
+
+if STARPU_USE_MP
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mp_common/mp_common.c
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mp_common/source_common.c
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mp_common/sink_common.c
+endif
+
+#########################################
+#										#
+#	     MIC compilation				#
+#										#
+#########################################
+
+if STARPU_USE_MIC
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mic/driver_mic_common.c
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mic/driver_mic_source.c
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mic/driver_mic_sink.c
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mic/driver_mic_utils.c
+endif
+
+#########################################
+
 showcheck:
 showcheck:
 	-cat /dev/null
 	-cat /dev/null

+ 2 - 0
src/common/fxt.h

@@ -37,6 +37,8 @@
 #define _STARPU_FUT_CPU_KEY	0x101
 #define _STARPU_FUT_CPU_KEY	0x101
 #define _STARPU_FUT_CUDA_KEY	0x102
 #define _STARPU_FUT_CUDA_KEY	0x102
 #define _STARPU_FUT_OPENCL_KEY	0x103
 #define _STARPU_FUT_OPENCL_KEY	0x103
+#define _STARPU_FUT_MIC_KEY	0x104
+#define _STARPU_FUT_SCC_KEY	0x105
 
 
 #define _STARPU_FUT_WORKER_INIT_START	0x5100
 #define _STARPU_FUT_WORKER_INIT_START	0x5100
 #define _STARPU_FUT_WORKER_INIT_END	0x5101
 #define _STARPU_FUT_WORKER_INIT_END	0x5101

+ 6 - 0
src/core/jobs.h

@@ -51,6 +51,8 @@ typedef void (*_starpu_cl_func_t)(void **, void *);
 #define _STARPU_CPU_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_CPU)
 #define _STARPU_CPU_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_CPU)
 #define _STARPU_CUDA_MAY_PERFORM(j)      ((j)->task->cl->where & STARPU_CUDA)
 #define _STARPU_CUDA_MAY_PERFORM(j)      ((j)->task->cl->where & STARPU_CUDA)
 #define _STARPU_OPENCL_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_OPENCL)
 #define _STARPU_OPENCL_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_OPENCL)
+#define _STARPU_MIC_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_MIC)
+#define _STARPU_SCC_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_SCC)
 
 
 /* A job is the internal representation of a task. */
 /* A job is the internal representation of a task. */
 LIST_TYPE(_starpu_job,
 LIST_TYPE(_starpu_job,
@@ -116,6 +118,10 @@ LIST_TYPE(_starpu_job,
 	 * so we need a flag to differentiate them from "normal" tasks. */
 	 * so we need a flag to differentiate them from "normal" tasks. */
 	unsigned reduction_task;
 	unsigned reduction_task;
 
 
+	/* Used by MIC driver to record codelet start time instead of using a
+	 * local variable */
+	struct timespec cl_start;
+
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
 	/* A symbol name may be associated to the job directly for debug
 	/* A symbol name may be associated to the job directly for debug
 	 * purposes (for instance if the codelet is NULL). */
 	 * purposes (for instance if the codelet is NULL). */

+ 51 - 4
src/core/perfmodel/perfmodel_bus.c

@@ -67,6 +67,7 @@ static unsigned was_benchmarked = 0;
 static unsigned ncpus = 0;
 static unsigned ncpus = 0;
 static unsigned ncuda = 0;
 static unsigned ncuda = 0;
 static unsigned nopencl = 0;
 static unsigned nopencl = 0;
+static unsigned nmic = 0;
 
 
 /* Benchmarking the performance of the bus */
 /* Benchmarking the performance of the bus */
 
 
@@ -91,6 +92,11 @@ static double opencldev_latency_dtoh[STARPU_MAXNODES] = {0.0};
 static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*STARPU_MAXCPUS];
 static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*STARPU_MAXCPUS];
 #endif
 #endif
 
 
+#ifdef STARPU_USE_MIC
+static double mic_time_host_to_device[STARPU_MAXNODES] = {0.0};
+static double mic_time_device_to_host[STARPU_MAXNODES] = {0.0};
+#endif /* STARPU_USE_MIC */
+
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
 static hwloc_topology_t hwtopology;
 static hwloc_topology_t hwtopology;
 #endif
 #endif
@@ -695,6 +701,19 @@ static void benchmark_all_gpu_devices(void)
 	}
 	}
 #endif
 #endif
 
 
+#ifdef STARPU_USE_MIC
+	/* TODO: implement real calibration! For now we only assign an arbitrary
+	 * value to each device here as a workaround; otherwise we get
+	 * problems with the heft scheduler. */
+        nmic = _starpu_mic_src_get_device_count();
+
+	for (i = 0; i < STARPU_MAXNODES; i++)
+	{
+		mic_time_host_to_device[i] = 0.1;
+		mic_time_device_to_host[i] = 0.1;
+	}
+#endif /* STARPU_USE_MIC */
+
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
 	hwloc_set_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
 	hwloc_set_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
 #elif __linux__
 #elif __linux__
@@ -1082,6 +1101,9 @@ static void write_bus_latency_file_content(void)
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
         maxnode += nopencl;
         maxnode += nopencl;
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+        maxnode += nmic;
+#endif
         for (src = 0; src < STARPU_MAXNODES; src++)
         for (src = 0; src < STARPU_MAXNODES; src++)
 	{
 	{
 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
@@ -1290,6 +1312,9 @@ static void write_bus_bandwidth_file_content(void)
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
         maxnode += nopencl;
         maxnode += nopencl;
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+        maxnode += nmic;
+#endif
 	for (src = 0; src < STARPU_MAXNODES; src++)
 	for (src = 0; src < STARPU_MAXNODES; src++)
 	{
 	{
 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
@@ -1300,7 +1325,7 @@ static void write_bus_bandwidth_file_content(void)
 			{
 			{
 				bandwidth = NAN;
 				bandwidth = NAN;
 			}
 			}
-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_MIC)
 			else if (src != dst)
 			else if (src != dst)
 			{
 			{
 				double slowness = 0.0;
 				double slowness = 0.0;
@@ -1320,11 +1345,17 @@ static void write_bus_bandwidth_file_content(void)
 				}
 				}
 #endif
 #endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-				if (src > ncuda)
+				if (src > ncuda && src <= ncuda + nopencl)
 					slowness += opencldev_timing_dtoh[src-ncuda];
 					slowness += opencldev_timing_dtoh[src-ncuda];
-				if (dst > ncuda)
+				if (dst > ncuda && dst <= ncuda + nopencl)
 					slowness += opencldev_timing_htod[dst-ncuda];
 					slowness += opencldev_timing_htod[dst-ncuda];
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+				if (src > ncuda + nopencl)
+					slowness += mic_time_device_to_host[src - (ncuda + nopencl)];
+				if (dst > ncuda + nopencl)
+					slowness += mic_time_host_to_device[dst - (ncuda + nopencl)];
+#endif
 				bandwidth = 1.0/slowness;
 				bandwidth = 1.0/slowness;
 			}
 			}
 #endif
 #endif
@@ -1364,6 +1395,9 @@ void starpu_bus_print_bandwidth(FILE *f)
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
         maxnode += nopencl;
         maxnode += nopencl;
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+        maxnode += nmic;
+#endif
 
 
 	fprintf(f, "from/to\t");
 	fprintf(f, "from/to\t");
 	fprintf(f, "RAM\t");
 	fprintf(f, "RAM\t");
@@ -1501,7 +1535,7 @@ static void check_bus_config_file(void)
 	{
 	{
                 FILE *f;
                 FILE *f;
                 int ret;
                 int ret;
-		unsigned read_cuda = -1, read_opencl = -1;
+		unsigned read_cuda = -1, read_opencl = -1, read_mic = -1;
                 unsigned read_cpus = -1;
                 unsigned read_cpus = -1;
 
 
                 // Loading configuration from file
                 // Loading configuration from file
@@ -1517,6 +1551,9 @@ static void check_bus_config_file(void)
 		ret = fscanf(f, "%d\t", &read_opencl);
 		ret = fscanf(f, "%d\t", &read_opencl);
 		STARPU_ASSERT(ret == 1);
 		STARPU_ASSERT(ret == 1);
                 _starpu_drop_comments(f);
                 _starpu_drop_comments(f);
+		ret = fscanf(f, "%d\t", &read_mic);
+		STARPU_ASSERT(ret == 1);
+                _starpu_drop_comments(f);
                 fclose(f);
                 fclose(f);
 
 
                 // Loading current configuration
                 // Loading current configuration
@@ -1527,6 +1564,9 @@ static void check_bus_config_file(void)
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
                 nopencl = _starpu_opencl_get_device_count();
                 nopencl = _starpu_opencl_get_device_count();
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+                nmic = _starpu_mic_src_get_device_count();
+#endif /* STARPU_USE_MIC */
 
 
                 // Checking if both configurations match
                 // Checking if both configurations match
                 if (read_cpus != ncpus)
                 if (read_cpus != ncpus)
@@ -1547,6 +1587,12 @@ static void check_bus_config_file(void)
                         _starpu_bus_force_sampling();
                         _starpu_bus_force_sampling();
 			_STARPU_DISP("... done\n");
 			_STARPU_DISP("... done\n");
                 }
                 }
+                else if (read_mic != nmic)
+		{
+                        _STARPU_DISP("Current configuration does not match the bus performance model (MIC: (stored) %d != (current) %d), recalibrating...", read_mic, nmic);
+                        _starpu_bus_force_sampling();
+			_STARPU_DISP("... done\n");
+                }
         }
         }
 }
 }
 
 
@@ -1567,6 +1613,7 @@ static void write_bus_config_file_content(void)
         fprintf(f, "%u # Number of CPUs\n", ncpus);
         fprintf(f, "%u # Number of CPUs\n", ncpus);
         fprintf(f, "%d # Number of CUDA devices\n", ncuda);
         fprintf(f, "%d # Number of CUDA devices\n", ncuda);
         fprintf(f, "%d # Number of OpenCL devices\n", nopencl);
         fprintf(f, "%d # Number of OpenCL devices\n", nopencl);
+        fprintf(f, "%d # Number of MIC devices\n", nmic);
 
 
         fclose(f);
         fclose(f);
 }
 }

+ 44 - 16
src/core/perfmodel/perfmodel_history.c

@@ -366,6 +366,21 @@ static void parse_model_file(FILE *f, struct starpu_perfmodel *model, unsigned s
 			   archmin + STARPU_MIN(narchs, STARPU_MAXOPENCLDEVS),
 			   archmin + STARPU_MIN(narchs, STARPU_MAXOPENCLDEVS),
 			   narchs > STARPU_MAXOPENCLDEVS ? narchs - STARPU_MAXOPENCLDEVS : 0);
 			   narchs > STARPU_MAXOPENCLDEVS ? narchs - STARPU_MAXOPENCLDEVS : 0);
 	}
 	}
+
+	/* Parsing MIC devs */
+	_starpu_drop_comments(f);
+	ret = fscanf(f, "%u\n", &narchs);
+	STARPU_ASSERT(ret == 1);
+
+	archmin += STARPU_MAXOPENCLDEVS;
+	_STARPU_DEBUG("Parsing %u MIC devices\n", narchs);
+	if (narchs > 0)
+	{
+		parse_arch(f, model, scan_history,
+			   archmin,
+			   archmin + STARPU_MIN(narchs, STARPU_MAXMICDEVS),
+			   narchs > STARPU_MAXMICDEVS ? narchs - STARPU_MAXMICDEVS : 0);
+	}
 }
 }
 
 
 
 
@@ -447,6 +462,7 @@ static void dump_model_file(FILE *f, struct starpu_perfmodel *model)
 		{
 		{
 			case STARPU_CUDA_DEFAULT:
 			case STARPU_CUDA_DEFAULT:
 			case STARPU_OPENCL_DEFAULT:
 			case STARPU_OPENCL_DEFAULT:
+			case STARPU_MIC_DEFAULT:
 				arch_base = arch;
 				arch_base = arch;
 				idx++;
 				idx++;
 				break;
 				break;
@@ -479,42 +495,48 @@ static void dump_model_file(FILE *f, struct starpu_perfmodel *model)
 	}
 	}
 
 
 	/* Writing stuff */
 	/* Writing stuff */
+
 	char *name = "unknown";
 	char *name = "unknown";
 	unsigned substract_to_arch = 0;
 	unsigned substract_to_arch = 0;
 	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
 	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
 	{
 	{
+		unsigned char arch_already_visited = 0;
+
 		switch (arch)
 		switch (arch)
 		{
 		{
 			case STARPU_CPU_DEFAULT:
 			case STARPU_CPU_DEFAULT:
-				arch_base = arch;
 				name = "CPU";
 				name = "CPU";
-				fprintf(f, "##################\n");
-				fprintf(f, "# %ss\n", name);
-				fprintf(f, "# maximum number of %ss\n", name);
-				fprintf(f, "%u\n", my_narch = narch[0]);
+				my_narch = narch[0];
 				break;
 				break;
 			case STARPU_CUDA_DEFAULT:
 			case STARPU_CUDA_DEFAULT:
-				arch_base = arch;
 				name = "CUDA";
 				name = "CUDA";
 				substract_to_arch = STARPU_MAXCPUS;
 				substract_to_arch = STARPU_MAXCPUS;
-				fprintf(f, "##################\n");
-				fprintf(f, "# %ss\n", name);
-				fprintf(f, "# number of %s architectures\n", name);
-				fprintf(f, "%u\n", my_narch = narch[1]);
+				my_narch = narch[1];
 				break;
 				break;
 			case STARPU_OPENCL_DEFAULT:
 			case STARPU_OPENCL_DEFAULT:
-				arch_base = arch;
 				name = "OPENCL";
 				name = "OPENCL";
-				substract_to_arch += STARPU_MAXCUDADEVS;
-				fprintf(f, "##################\n");
-				fprintf(f, "# %ss\n", name);
-				fprintf(f, "# number of %s architectures\n", name);
-				fprintf(f, "%u\n", my_narch = narch[2]);
+				my_narch = narch[2];
+				break;
+			case STARPU_MIC_DEFAULT:
+				name = "MIC";
+				my_narch = narch[3];
 				break;
 				break;
 			default:
 			default:
+				/* The current worker arch was already written,
+				 * we don't need to write it again */
+				arch_already_visited = 1;
 				break;
 				break;
 		}
 		}
 
 
+		if (!arch_already_visited)
+		{
+			arch_base = arch;
+			fprintf(f, "##################\n");
+			fprintf(f, "# %ss\n", name);
+			fprintf(f, "# number of %s architectures\n", name);
+			fprintf(f, "%u\n", my_narch);
+		}
+
 		unsigned max_impl = 0;
 		unsigned max_impl = 0;
 		if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
 		if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
 		{
 		{
@@ -1024,6 +1046,12 @@ void starpu_perfmodel_get_arch_name(enum starpu_perfmodel_archtype arch, char *a
 		int devid = arch - STARPU_OPENCL_DEFAULT;
 		int devid = arch - STARPU_OPENCL_DEFAULT;
 		snprintf(archname, maxlen, "opencl_%d_impl_%u", devid,nimpl);
 		snprintf(archname, maxlen, "opencl_%d_impl_%u", devid,nimpl);
 	}
 	}
+	else if ((STARPU_MIC_DEFAULT <= arch)
+		&& (arch < STARPU_MIC_DEFAULT + STARPU_MAXMICDEVS))
+	{
+		int devid = arch - STARPU_MIC_DEFAULT;
+		snprintf(archname, maxlen, "mic_%d_impl_%u", devid, nimpl);
+	}
 	else
 	else
 	{
 	{
 		STARPU_ABORT();
 		STARPU_ABORT();

+ 28 - 0
src/core/sched_policy.c

@@ -486,6 +486,20 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 			break;
 			break;
 		}
 		}
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+		case STARPU_MIC_RAM:
+		{
+			/* The braces give mf_ops its own scope: a declaration
+			 * cannot directly follow a case label, and the SCC
+			 * case below declares the same name. */
+			struct starpu_multiformat_data_interface_ops *mf_ops;
+			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
+			conversion_task->cl = mf_ops->mic_to_cpu_cl;
+			break;
+		}
+#endif
+#ifdef STARPU_USE_SCC
+		case STARPU_SCC_RAM:
+		{
+			/* Guarded by STARPU_USE_SCC (not STARPU_USE_MIC), like
+			 * the CPU -> SCC direction handled further down. */
+			struct starpu_multiformat_data_interface_ops *mf_ops;
+			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
+			conversion_task->cl = mf_ops->scc_to_cpu_cl;
+			break;
+		}
+#endif
 		default:
 		default:
 			_STARPU_ERROR("Oops : %u\n", handle->mf_node);
 			_STARPU_ERROR("Oops : %u\n", handle->mf_node);
 		}
 		}
@@ -508,6 +522,20 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 		break;
 		break;
 	}
 	}
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+	case STARPU_MIC_RAM:
+	{
+		/* The braces give mf_ops its own scope: a declaration cannot
+		 * directly follow a case label, and the SCC case below
+		 * declares the same name. */
+		struct starpu_multiformat_data_interface_ops *mf_ops;
+		mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
+		conversion_task->cl = mf_ops->cpu_to_mic_cl;
+		break;
+	}
+#endif
+#ifdef STARPU_USE_SCC
+	case STARPU_SCC_RAM:
+	{
+		struct starpu_multiformat_data_interface_ops *mf_ops;
+		mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
+		conversion_task->cl = mf_ops->cpu_to_scc_cl;
+		break;
+	}
+#endif
 	default:
 	default:
 		STARPU_ABORT();
 		STARPU_ABORT();
 	}
 	}

+ 26 - 0
src/core/task.c

@@ -155,6 +155,11 @@ void _starpu_task_destroy(struct starpu_task *task)
 		starpu_task_clean(task);
 		starpu_task_clean(task);
 		/* TODO handle the case of task with detach = 1 and destroy = 1 */
 		/* TODO handle the case of task with detach = 1 and destroy = 1 */
 		/* TODO handle the case of non terminated tasks -> return -EINVAL */
 		/* TODO handle the case of non terminated tasks -> return -EINVAL */
+
+		/* Does user want StarPU release cl_arg ? */
+		if (task->cl_arg_free)
+			free(task->cl_arg);
+
 		free(task);
 		free(task);
 	}
 	}
 }
 }
@@ -871,6 +876,8 @@ _starpu_handle_needs_conversion_task_for_arch(starpu_data_handle_t handle,
 					return 0;
 					return 0;
 				case STARPU_CUDA_RAM:      /* Fall through */
 				case STARPU_CUDA_RAM:      /* Fall through */
 				case STARPU_OPENCL_RAM:
 				case STARPU_OPENCL_RAM:
+				case STARPU_MIC_RAM:
+				case STARPU_SCC_RAM:
 					return 1;
 					return 1;
 				default:
 				default:
 					STARPU_ABORT();
 					STARPU_ABORT();
@@ -878,12 +885,16 @@ _starpu_handle_needs_conversion_task_for_arch(starpu_data_handle_t handle,
 			break;
 			break;
 		case STARPU_CUDA_RAM:    /* Fall through */
 		case STARPU_CUDA_RAM:    /* Fall through */
 		case STARPU_OPENCL_RAM:
 		case STARPU_OPENCL_RAM:
+		case STARPU_MIC_RAM:
+		case STARPU_SCC_RAM:
 			switch(starpu_node_get_kind(handle->mf_node))
 			switch(starpu_node_get_kind(handle->mf_node))
 			{
 			{
 				case STARPU_CPU_RAM:
 				case STARPU_CPU_RAM:
 					return 1;
 					return 1;
 				case STARPU_CUDA_RAM:
 				case STARPU_CUDA_RAM:
 				case STARPU_OPENCL_RAM:
 				case STARPU_OPENCL_RAM:
+				case STARPU_MIC_RAM:
+				case STARPU_SCC_RAM:
 					return 0;
 					return 0;
 				default:
 				default:
 					STARPU_ABORT();
 					STARPU_ABORT();
@@ -920,3 +931,18 @@ unsigned starpu_task_get_implementation(struct starpu_task *task)
 {
 {
 	return _starpu_get_job_associated_to_task(task)->nimpl;
 	return _starpu_get_job_associated_to_task(task)->nimpl;
 }
 }
+
+/* Return entry number nimpl of the codelet's mic_funcs table.
+ * No bound check is performed on nimpl. */
+starpu_mic_func_t _starpu_task_get_mic_nth_implementation(struct starpu_codelet *cl, unsigned nimpl)
+{
+	starpu_mic_func_t mic_impl = cl->mic_funcs[nimpl];
+
+	return mic_impl;
+}
+
+/* Return entry number nimpl of the codelet's scc_funcs table.
+ * No bound check is performed on nimpl. */
+starpu_scc_func_t _starpu_task_get_scc_nth_implementation(struct starpu_codelet *cl, unsigned nimpl)
+{
+	starpu_scc_func_t scc_impl = cl->scc_funcs[nimpl];
+
+	return scc_impl;
+}
+
+/* Return entry number nimpl of the codelet's cpu_funcs_name table.
+ * No bound check is performed on nimpl. */
+char *_starpu_task_get_cpu_name_nth_implementation(struct starpu_codelet *cl, unsigned nimpl)
+{
+	char *impl_name = cl->cpu_funcs_name[nimpl];
+
+	return impl_name;
+}

+ 4 - 0
src/core/task.h

@@ -72,6 +72,10 @@ void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl);
 starpu_cpu_func_t _starpu_task_get_cpu_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
 starpu_cpu_func_t _starpu_task_get_cpu_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
 starpu_cuda_func_t _starpu_task_get_cuda_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
 starpu_cuda_func_t _starpu_task_get_cuda_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
 starpu_opencl_func_t _starpu_task_get_opencl_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
 starpu_opencl_func_t _starpu_task_get_opencl_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
+starpu_mic_func_t _starpu_task_get_mic_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
+starpu_scc_func_t _starpu_task_get_scc_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
+
+char *_starpu_task_get_cpu_name_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
 
 
 #define _STARPU_TASK_SET_INTERFACE(task, interface, i) do { if (task->dyn_handles) task->dyn_interfaces[i] = interface; else task->interfaces[i] = interface;} while(0)
 #define _STARPU_TASK_SET_INTERFACE(task, interface, i) do { if (task->dyn_handles) task->dyn_interfaces[i] = interface; else task->interfaces[i] = interface;} while(0)
 #define _STARPU_TASK_GET_INTERFACES(task) ((task->dyn_handles) ? task->dyn_interfaces : task->interfaces)
 #define _STARPU_TASK_GET_INTERFACES(task) ((task->dyn_handles) ? task->dyn_interfaces : task->interfaces)

+ 389 - 13
src/core/topology.c

@@ -23,6 +23,9 @@
 #include <core/debug.h>
 #include <core/debug.h>
 #include <core/topology.h>
 #include <core/topology.h>
 #include <drivers/cuda/driver_cuda.h>
 #include <drivers/cuda/driver_cuda.h>
+#include <drivers/mic/driver_mic_source.h>
+#include <drivers/scc/driver_scc_source.h>
+#include <drivers/mp_common/source_common.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <profiling/profiling.h>
 #include <profiling/profiling.h>
 #include <common/uthash.h>
 #include <common/uthash.h>
@@ -45,7 +48,7 @@
 
 
 static unsigned topology_is_initialized = 0;
 static unsigned topology_is_initialized = 0;
 
 
-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_MIC) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID)
 
 
 struct handle_entry
 struct handle_entry
 {
 {
@@ -67,9 +70,9 @@ static unsigned may_bind_automatically = 0;
  * Discover the topology of the machine
  * Discover the topology of the machine
  */
  */
 
 
-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_MIC) || defined(STARPU_USE_SCC)  || defined(STARPU_SIMGRID)
 static void
 static void
-_starpu_initialize_workers_gpuid (int *explicit_workers_gpuid,
+_starpu_initialize_workers_deviceid (int *explicit_workers_gpuid,
 				  int *current, int *workers_gpuid,
 				  int *current, int *workers_gpuid,
 				  const char *varname, unsigned nhwgpus)
 				  const char *varname, unsigned nhwgpus)
 {
 {
@@ -144,7 +147,8 @@ _starpu_initialize_workers_gpuid (int *explicit_workers_gpuid,
 			  workers_gpuid[i] = (unsigned)(i % nhwgpus);
 			  workers_gpuid[i] = (unsigned)(i % nhwgpus);
 
 
 		/* StarPU can use sampling techniques to bind threads
 		/* StarPU can use sampling techniques to bind threads
-		 * correctly */
+		 * correctly
+		 * TODO: use a private value for each kind of device */
 		may_bind_automatically = 1;
 		may_bind_automatically = 1;
 	}
 	}
 }
 }
@@ -157,7 +161,7 @@ _starpu_initialize_workers_cuda_gpuid (struct _starpu_machine_config *config)
 	struct starpu_machine_topology *topology = &config->topology;
 	struct starpu_machine_topology *topology = &config->topology;
 	struct starpu_conf *uconf = config->conf;
 	struct starpu_conf *uconf = config->conf;
 
 
-        _starpu_initialize_workers_gpuid (
+        _starpu_initialize_workers_deviceid (
 		uconf->use_explicit_workers_cuda_gpuid == 0
 		uconf->use_explicit_workers_cuda_gpuid == 0
 		? NULL
 		? NULL
 		: (int *)uconf->workers_cuda_gpuid,
 		: (int *)uconf->workers_cuda_gpuid,
@@ -184,7 +188,7 @@ _starpu_initialize_workers_opencl_gpuid (struct _starpu_machine_config*config)
 	struct starpu_machine_topology *topology = &config->topology;
 	struct starpu_machine_topology *topology = &config->topology;
 	struct starpu_conf *uconf = config->conf;
 	struct starpu_conf *uconf = config->conf;
 
 
-        _starpu_initialize_workers_gpuid(
+        _starpu_initialize_workers_deviceid(
 		uconf->use_explicit_workers_opencl_gpuid == 0
 		uconf->use_explicit_workers_opencl_gpuid == 0
 		? NULL
 		? NULL
 		: (int *)uconf->workers_opencl_gpuid,
 		: (int *)uconf->workers_opencl_gpuid,
@@ -258,6 +262,145 @@ _starpu_get_next_opencl_gpuid (struct _starpu_machine_config *config)
 }
 }
 #endif
 #endif
 
 
+#if 0
+#if defined(STARPU_USE_MIC) || defined(STARPU_SIMGRID)
+/* MIC counterpart of the CUDA/OpenCL device-id initializers: forwards the
+ * user-provided MIC device ids (or NULL when none were given explicitly)
+ * to _starpu_initialize_workers_deviceid().
+ *
+ * This definition is currently compiled out by the enclosing "#if 0".
+ * NOTE(review): the call below passes four arguments while
+ * _starpu_initialize_workers_deviceid() takes five (the workers id array is
+ * missing), and nhwmiccores is indexed per device elsewhere in this file --
+ * both need fixing before this code is re-enabled.
+ * NOTE(review): config->user_conf is used here although uconf (config->conf)
+ * is already available -- confirm which field is intended. */
+static void _starpu_initialize_workers_mic_deviceid(struct _starpu_machine_config *config)
+{
+	struct starpu_machine_topology *topology = &config->topology;
+	struct starpu_conf *uconf = config->conf;
+
+	_starpu_initialize_workers_deviceid(
+		uconf->use_explicit_workers_mic_deviceid == 0
+		? NULL
+		: (int *)config->user_conf->workers_mic_deviceid,
+		&(config->current_mic_deviceid),
+		"STARPU_WORKERS_MICID",
+		topology->nhwmiccores);
+}
+#endif
+#endif
+
+#ifdef STARPU_USE_SCC
+/* SCC counterpart of the CUDA/OpenCL device-id initializers: forwards the
+ * user-provided SCC device ids (or NULL when none were given explicitly),
+ * the round-robin cursor, the topology's SCC id array, the name of the
+ * overriding environment variable and the number of detected SCC devices
+ * to _starpu_initialize_workers_deviceid(). */
+static void _starpu_initialize_workers_scc_deviceid(struct _starpu_machine_config *config)
+{
+	struct starpu_machine_topology *topology = &config->topology;
+	struct starpu_conf *uconf = config->conf;
+
+	_starpu_initialize_workers_deviceid(
+		uconf->use_explicit_workers_scc_deviceid == 0
+		? NULL
+		: (int *) uconf->workers_scc_deviceid,
+		&(config->current_scc_deviceid),
+		/* Third parameter (workers_gpuid) of the helper: the id array
+		 * that _starpu_get_next_scc_deviceid() reads back. */
+		(int *) topology->workers_scc_deviceid,
+		"STARPU_WORKERS_SCCID",
+		topology->nhwscc);
+}
+#endif /* STARPU_USE_SCC */
+
+#if 0
+#ifdef STARPU_USE_MIC
+/* Return the next MIC device id, cycling through
+ * topology.workers_mic_deviceid and advancing config->current_mic_deviceid.
+ * This definition is currently compiled out by the enclosing "#if 0".
+ * NOTE(review): the modulo assumes topology.nmicdevices is non-zero --
+ * confirm against the callers before re-enabling. */
+static inline int _starpu_get_next_mic_deviceid(struct _starpu_machine_config *config)
+{
+	unsigned i = ((config->current_mic_deviceid++) % config->topology.nmicdevices);
+
+	return (int)config->topology.workers_mic_deviceid[i];
+}
+#endif
+#endif
+
+#ifdef STARPU_USE_SCC
+/* Return the next SCC device id, cycling through the topology's
+ * workers_scc_deviceid array; the cursor config->current_scc_deviceid is
+ * advanced by one on every call. */
+static inline int _starpu_get_next_scc_deviceid(struct _starpu_machine_config *config)
+{
+	struct starpu_machine_topology *topology = &config->topology;
+	unsigned slot = config->current_scc_deviceid % topology->nsccdevices;
+
+	config->current_scc_deviceid++;
+
+	return (int)topology->workers_scc_deviceid[slot];
+}
+#endif
+
+#ifdef STARPU_USE_MIC
+static void
+_starpu_init_mic_topology (struct _starpu_machine_config *config, long mic_idx)
+{
+    /* Discover the topology of the MIC node identified by MIC_IDX: query
+     * the number of cores through _starpu_src_common_sink_nbcores() and
+     * store it in the `nhwmiccores' topology field of that node. */
+
+    int sink_cores;
+
+    _starpu_src_common_sink_nbcores (mic_nodes[mic_idx], &sink_cores);
+    config->topology.nhwmiccores[mic_idx] = sink_cores;
+}
+
+
+static int
+_starpu_init_mic_node (struct _starpu_machine_config *config, int mic_idx,
+		       COIENGINE *coi_handle, COIPROCESS *coi_process)
+{
+    /* Initialize the MIC node of index MIC_IDX: locate the sink program,
+     * launch it on the device through COI and create the mp node used to
+     * communicate with it.
+     * Returns -1 when the sink program cannot be located, 0 otherwise; COI
+     * failures are handed to STARPU_MIC_SRC_REPORT_COI_ERROR. */
+
+    /* The user-provided configuration is read from config->conf, like in
+     * the other initialization helpers of this file. */
+    struct starpu_conf *user_conf = config->conf;
+
+    char ***argv = _starpu_get_argv();
+    const char *suffixes[] = {"-mic", "_mic", NULL};
+
+    /* Environment variables to send to the Sink, it informs it what kind
+     * of node it is (architecture and type) as there is no way to discover
+     * it itself */
+    char mic_idx_env[32];
+    snprintf(mic_idx_env, sizeof(mic_idx_env), "DEVID=%d", mic_idx);
+
+    /* XXX: this is currently necessary so that the remote process does not
+     * segfault. */
+    char nb_mic_env[32];
+    snprintf(nb_mic_env, sizeof(nb_mic_env), "NB_MIC=%d", 2);
+
+    const char *mic_sink_env[] = {"STARPU_SINK=STARPU_MIC", mic_idx_env, nb_mic_env, NULL};
+
+    char mic_sink_program_path[1024];
+    /* Let's get the helper program to run on the MIC device */
+    int mic_file_found =
+	_starpu_src_common_locate_file (mic_sink_program_path,
+					getenv("STARPU_MIC_SINK_PROGRAM_NAME"),
+					getenv("STARPU_MIC_SINK_PROGRAM_PATH"),
+					(user_conf==NULL ? NULL : user_conf->mic_sink_program_path),
+					(argv ? (*argv)[0] : NULL),
+					suffixes);
+
+    if (0 != mic_file_found) {
+	/* Adjacent literals are concatenated verbatim, so each piece ends
+	 * with the space that separates it from the next one. */
+	fprintf(stderr, "No MIC program specified, use the environment "
+		"variable STARPU_MIC_SINK_PROGRAM_NAME "
+		"or the field 'starpu_conf.mic_sink_program_path' "
+		"to define it.\n");
+
+	return -1;
+    }
+
+    COIRESULT res;
+    /* Let's get the handle which let us manage the remote MIC device */
+    res = COIEngineGetHandle(COI_ISA_MIC, mic_idx, coi_handle);
+    if (STARPU_UNLIKELY(res != COI_SUCCESS))
+	STARPU_MIC_SRC_REPORT_COI_ERROR(res);
+
+    /* We launch the helper on the MIC device, which will wait for us
+     * to give it work to do.
+     * As we will communicate further with the device through scif we
+     * don't need to keep the process pointer */
+    res = COIProcessCreateFromFile(*coi_handle, mic_sink_program_path, 0, NULL, 0,
+				   mic_sink_env, 1, NULL, 0, NULL,
+				   coi_process);
+    if (STARPU_UNLIKELY(res != COI_SUCCESS))
+	STARPU_MIC_SRC_REPORT_COI_ERROR(res);
+
+    /* Let's create the node structure, we'll communicate with the peer
+     * through scif thanks to it */
+    mic_nodes[mic_idx] =
+	_starpu_mp_common_node_create(STARPU_MIC_SOURCE, mic_idx);
+
+    return 0;
+}
+#endif
+
 
 
 static void
 static void
 _starpu_init_topology (struct _starpu_machine_config *config)
 _starpu_init_topology (struct _starpu_machine_config *config)
@@ -284,6 +427,9 @@ _starpu_init_topology (struct _starpu_machine_config *config)
 	_starpu_cpu_discover_devices(config);
 	_starpu_cpu_discover_devices(config);
 	_starpu_cuda_discover_devices(config);
 	_starpu_cuda_discover_devices(config);
 	_starpu_opencl_discover_devices(config);
 	_starpu_opencl_discover_devices(config);
+#ifdef STARPU_USE_SCC
+	config->topology.nhwscc = _starpu_scc_src_get_device_count();
+#endif
 
 
 	topology_is_initialized = 1;
 	topology_is_initialized = 1;
 }
 }
@@ -434,8 +580,109 @@ _starpu_topology_get_nhwcpu (struct _starpu_machine_config *config)
 	return config->topology.nhwcpus;
 	return config->topology.nhwcpus;
 }
 }
 
 
+#ifdef STARPU_USE_MIC
+static void
+_starpu_init_mic_config (struct _starpu_machine_config *config,
+			 struct starpu_conf *user_conf,
+			 unsigned mic_idx)
+{
+    /* Configure the MIC device of index MIC_IDX: decide how many of its
+     * cores are used (STARPU_NMIC first, then user_conf->nmic, else every
+     * detected core) and set up one worker entry per selected core. */
+
+    struct starpu_machine_topology *topology = &config->topology;
+
+    topology->nhwmiccores[mic_idx] = 0;
+
+    _starpu_init_mic_topology (config, mic_idx);
+
+    int nmiccores;
+    nmiccores = starpu_get_env_number("STARPU_NMIC");
+
+    /* STARPU_NMIC is not set. Did the user specify anything ? */
+    if (nmiccores == -1 && user_conf)
+	nmiccores = user_conf->nmic;
+
+    if (nmiccores != 0)
+    {
+	if (nmiccores == -1)
+	{
+	    /* Nothing was specified, so let's use the number of
+	     * detected mic cores. */
+	    nmiccores = topology->nhwmiccores[mic_idx];
+	}
+	else
+	{
+	    if (nmiccores > topology->nhwmiccores[mic_idx])
+	    {
+		/* The user requires more MIC cores than this device
+		 * provides: the values compared here are core counts,
+		 * not device counts. */
+		fprintf(stderr,
+			"# Warning: %d MIC cores requested. Only %d available.\n",
+			nmiccores, topology->nhwmiccores[mic_idx]);
+		nmiccores = topology->nhwmiccores[mic_idx];
+	    }
+	}
+    }
+
+    topology->nmiccores[mic_idx] = nmiccores;
+    STARPU_ASSERT(topology->nmiccores[mic_idx] + topology->nworkers <= STARPU_NMAXWORKERS);
+
+    /* _starpu_initialize_workers_mic_deviceid (config); */
+
+    /* One worker per selected core, appended after the workers that are
+     * already registered; mp_nodeid records the owning MIC device and
+     * devid the core index within it. */
+    unsigned miccore_id;
+    for (miccore_id = 0; miccore_id < topology->nmiccores[mic_idx]; miccore_id++)
+    {
+	config->workers[topology->nworkers + miccore_id].arch = STARPU_MIC_WORKER;
+	config->workers[topology->nworkers + miccore_id].perf_arch = STARPU_MIC_DEFAULT;
+	config->workers[topology->nworkers + miccore_id].mp_nodeid = mic_idx;
+	config->workers[topology->nworkers + miccore_id].devid = miccore_id;
+	config->workers[topology->nworkers + miccore_id].worker_mask = STARPU_MIC;
+	config->worker_mask |= STARPU_MIC;
+    }
+
+    topology->nworkers += topology->nmiccores[mic_idx];
+}
+
+
+static void
+_starpu_init_mp_config (struct _starpu_machine_config *config,
+			struct starpu_conf *user_conf)
+{
+    /* Discover and configure the mp topology. That means:
+     * - discover the number of mp nodes;
+     * - initialize each discovered node;
+     * - discover the local topology (number of PUs/devices) of each node;
+     * - configure the workers accordingly.
+     */
+
+    struct starpu_machine_topology *topology = &config->topology;
+
+    // We currently only support MIC at this level.
+#ifdef STARPU_USE_MIC
+    /* One slot per supported MIC device, so that indexing by device
+     * number below cannot run past the end of the arrays. */
+    static COIENGINE handles[STARPU_MAXMICDEVS];
+    static COIPROCESS process[STARPU_MAXMICDEVS];
+
+    /* Discover and initialize the number of MIC nodes through the mp
+     * infrastructure. */
+    unsigned nhwmicdevices = _starpu_mic_src_get_device_count();
+
+    /* STARPU_NMICDEVS may restrict the number of devices; -1 means the
+     * variable gave no value, in which case every detected device is used. */
+    int reqmicdevices = starpu_get_env_number("STARPU_NMICDEVS");
+    if (-1 == reqmicdevices)
+	reqmicdevices = nhwmicdevices;
+
+    /* Never start more nodes than the per-device arrays can hold. */
+    unsigned nmicnodes = STARPU_MIN (nhwmicdevices, (unsigned) reqmicdevices);
+    if (nmicnodes > STARPU_MAXMICDEVS)
+	nmicnodes = STARPU_MAXMICDEVS;
+
+    topology->nmicdevices = 0;
+    unsigned i;
+    for (i = 0; i < nmicnodes; i++)
+	if (0 == _starpu_init_mic_node (config, i, &handles[i], &process[i]))
+	    topology->nmicdevices++;
+
+    for (i = 0; i < topology->nmicdevices; i++)
+	_starpu_init_mic_config (config, user_conf, i);
+#endif
+}
+#endif
+
 static int
 static int
-_starpu_init_machine_config (struct _starpu_machine_config *config)
+_starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_config)
 {
 {
 	int i;
 	int i;
 	for (i = 0; i < STARPU_NMAXWORKERS; i++)
 	for (i = 0; i < STARPU_NMAXWORKERS; i++)
@@ -498,6 +745,7 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 		int devid = _starpu_get_next_cuda_gpuid(config);
 		int devid = _starpu_get_next_cuda_gpuid(config);
 		enum starpu_perfmodel_archtype arch =
 		enum starpu_perfmodel_archtype arch =
 			(enum starpu_perfmodel_archtype)((int)STARPU_CUDA_DEFAULT + devid);
 			(enum starpu_perfmodel_archtype)((int)STARPU_CUDA_DEFAULT + devid);
+		config->workers[worker_idx].mp_nodeid = -1;
 		config->workers[worker_idx].devid = devid;
 		config->workers[worker_idx].devid = devid;
 		config->workers[worker_idx].perf_arch = arch;
 		config->workers[worker_idx].perf_arch = arch;
 		config->workers[worker_idx].worker_mask = STARPU_CUDA;
 		config->workers[worker_idx].worker_mask = STARPU_CUDA;
@@ -572,6 +820,7 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 		config->workers[worker_idx].arch = STARPU_OPENCL_WORKER;
 		config->workers[worker_idx].arch = STARPU_OPENCL_WORKER;
 		enum starpu_perfmodel_archtype arch =
 		enum starpu_perfmodel_archtype arch =
 			(enum starpu_perfmodel_archtype)((int)STARPU_OPENCL_DEFAULT + devid);
 			(enum starpu_perfmodel_archtype)((int)STARPU_OPENCL_DEFAULT + devid);
+		config->workers[worker_idx].mp_nodeid = -1;
 		config->workers[worker_idx].devid = devid;
 		config->workers[worker_idx].devid = devid;
 		config->workers[worker_idx].perf_arch = arch;
 		config->workers[worker_idx].perf_arch = arch;
 		config->workers[worker_idx].worker_mask = STARPU_OPENCL;
 		config->workers[worker_idx].worker_mask = STARPU_OPENCL;
@@ -582,6 +831,78 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 	topology->nworkers += topology->nopenclgpus;
 	topology->nworkers += topology->nopenclgpus;
 #endif
 #endif
 
 
+#ifdef STARPU_USE_SCC
+	int nscc = config->conf->nscc;
+
+	unsigned nb_scc_nodes = _starpu_scc_src_get_device_count();
+
+	if (nscc != 0)
+	{
+		/* The user did not disable SCC. We need to count
+		 * the number of devices */
+		int nb_devices = nb_scc_nodes;
+
+		if (nscc == -1)
+		{
+			/* Nothing was specified, so let's choose ! */
+			nscc = nb_devices;
+			if (nscc > STARPU_MAXSCCDEVS)
+			{
+				_STARPU_DISP("Warning: %d SCC devices available. Only %d enabled. Use configuration option --enable-maxsccdev=xxx to update the maximum value of supported SCC devices.\n", nb_devices, STARPU_MAXSCCDEVS);
+				nscc = STARPU_MAXSCCDEVS;
+			}
+		}
+		else
+		{
+			/* Let's make sure this value is OK. */
+			if (nscc > nb_devices)
+			{
+				/* The user requires more SCC devices than there is available */
+				_STARPU_DISP("Warning: %d SCC devices requested. Only %d available.\n", nscc, nb_devices);
+				nscc = nb_devices;
+			}
+			/* Let's make sure this value is OK. */
+			if (nscc > STARPU_MAXSCCDEVS)
+			{
+				_STARPU_DISP("Warning: %d SCC devices requested. Only %d enabled. Use configure option --enable-maxsccdev=xxx to update the maximum value of supported SCC devices.\n", nscc, STARPU_MAXSCCDEVS);
+				nscc = STARPU_MAXSCCDEVS;
+			}
+		}
+	}
+
+	/* Now we know how many SCC devices will be used */
+	topology->nsccdevices = nscc;
+	STARPU_ASSERT(topology->nsccdevices + topology->nworkers <= STARPU_NMAXWORKERS);
+
+	_starpu_initialize_workers_scc_deviceid(config);
+
+	unsigned sccdev;
+	/* One worker per enabled SCC device, appended after the workers
+	 * that are already registered. */
+	for (sccdev = 0; sccdev < topology->nsccdevices; sccdev++)
+	{
+		int worker_idx = topology->nworkers + sccdev;
+		config->workers[worker_idx].arch = STARPU_SCC_WORKER;
+		int devid = _starpu_get_next_scc_deviceid(config);
+		/* Same enum type as the CUDA and OpenCL workers above. */
+		enum starpu_perfmodel_archtype arch =
+			(enum starpu_perfmodel_archtype)((int)STARPU_SCC_DEFAULT + devid);
+		config->workers[worker_idx].mp_nodeid = -1;
+		config->workers[worker_idx].devid = devid;
+		config->workers[worker_idx].perf_arch = arch;
+		config->workers[worker_idx].worker_mask = STARPU_SCC;
+		config->worker_mask |= STARPU_SCC;
+	}
+
+	for (; sccdev < nb_scc_nodes; ++sccdev)
+		_starpu_scc_exit_useless_node(sccdev);
+
+	topology->nworkers += topology->nsccdevices;
+#endif /* STARPU_USE_SCC */
+
+
+	/* Unless the caller disabled it through no_mp_config, complete the
+	 * configuration with that of the mp nodes. */
+#ifdef STARPU_USE_MIC
+	if (! no_mp_config)
+	    _starpu_init_mp_config (config, user_conf);
+#endif
+
 /* we put the CPU section after the accelerator : in case there was an
 /* we put the CPU section after the accelerator : in case there was an
  * accelerator found, we devote one cpu */
  * accelerator found, we devote one cpu */
 #if defined(STARPU_USE_CPU) || defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_CPU) || defined(STARPU_SIMGRID)
@@ -591,8 +912,15 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 	{
 	{
 		if (ncpu == -1)
 		if (ncpu == -1)
 		{
 		{
-			unsigned already_busy_cpus = topology->ncudagpus + topology->nopenclgpus;
-			long avail_cpus = topology->nhwcpus - already_busy_cpus;
+			unsigned mic_busy_cpus = 0;
+			unsigned i = 0;
+			for (i = 0; i < STARPU_MAXMICDEVS; i++)
+				mic_busy_cpus += (topology->nmiccores[i] ? 1 : 0);
+
+			unsigned already_busy_cpus = mic_busy_cpus + topology->ncudagpus
+				+ topology->nopenclgpus + topology->nsccdevices;
+
+			long avail_cpus = (long) topology->nhwcpus - (long) already_busy_cpus;
 			if (avail_cpus < 0)
 			if (avail_cpus < 0)
 				avail_cpus = 0;
 				avail_cpus = 0;
 			ncpu = STARPU_MIN(avail_cpus, STARPU_MAXCPUS);
 			ncpu = STARPU_MIN(avail_cpus, STARPU_MAXCPUS);
@@ -617,6 +945,7 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 		int worker_idx = topology->nworkers + cpu;
 		int worker_idx = topology->nworkers + cpu;
 		config->workers[worker_idx].arch = STARPU_CPU_WORKER;
 		config->workers[worker_idx].arch = STARPU_CPU_WORKER;
 		config->workers[worker_idx].perf_arch = STARPU_CPU_DEFAULT;
 		config->workers[worker_idx].perf_arch = STARPU_CPU_DEFAULT;
+		config->workers[worker_idx].mp_nodeid = -1;
 		config->workers[worker_idx].devid = cpu;
 		config->workers[worker_idx].devid = cpu;
 		config->workers[worker_idx].worker_mask = STARPU_CPU;
 		config->workers[worker_idx].worker_mask = STARPU_CPU;
 		config->worker_mask |= STARPU_CPU;
 		config->worker_mask |= STARPU_CPU;
@@ -745,7 +1074,7 @@ _starpu_bind_thread_on_cpus (
 
 
 
 
 static void
 static void
-_starpu_init_workers_binding (struct _starpu_machine_config *config)
+_starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_config)
 {
 {
 	/* launch one thread per CPU */
 	/* launch one thread per CPU */
 	unsigned ram_memory_node;
 	unsigned ram_memory_node;
@@ -770,6 +1099,21 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config)
 	 * combinations in a matrix which we initialize here. */
 	 * combinations in a matrix which we initialize here. */
 	_starpu_initialize_busid_matrix();
 	_starpu_initialize_busid_matrix();
 
 
+#ifdef STARPU_USE_MIC
+	/* Each MIC device has its own memory node. */
+	unsigned mic_memory_nodes[STARPU_MAXMICDEVS];
+
+	// Register the memory nodes for the MIC devices.
+	if (! no_mp_config) {
+	    unsigned i = 0;
+	    for (i = 0; i < config->topology.nmicdevices; i++) {
+		mic_memory_nodes[i] = _starpu_register_memory_node (STARPU_MIC_RAM, i);
+		_starpu_register_bus(0, mic_memory_nodes[i]);
+		_starpu_register_bus(mic_memory_nodes[i], 0);
+	    }
+	}
+#endif
+
 	unsigned worker;
 	unsigned worker;
 	for (worker = 0; worker < config->topology.nworkers; worker++)
 	for (worker = 0; worker < config->topology.nworkers; worker++)
 	{
 	{
@@ -852,6 +1196,38 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config)
 				break;
 				break;
 #endif
 #endif
 
 
+#ifdef STARPU_USE_MIC
+		        case STARPU_MIC_WORKER:
+				//if (may_bind_automatically)
+				//{
+				//	/* StarPU is allowed to bind threads automatically */
+				//	preferred_binding = _starpu_get_mic_affinity_vector(workerarg->devid);
+				//	npreferred = config->topology.nhwcpus;
+				//}
+				is_a_set_of_accelerators = 1;
+				memory_node = mic_memory_nodes[workerarg->mp_nodeid];
+				_starpu_memory_node_worker_add(memory_node);
+				/* memory_node = _starpu_register_memory_node(STARPU_MIC_RAM, workerarg->devid);*/
+
+				/* _starpu_register_bus(0, memory_node);
+				 * _starpu_register_bus(memory_node, 0); */
+				break;
+#endif /* STARPU_USE_MIC */
+
+#ifdef STARPU_USE_SCC
+			case STARPU_SCC_WORKER:
+			{
+				/* Node 0 represents the SCC shared memory when we're on SCC. */
+				struct _starpu_mem_node_descr *descr = _starpu_get_memory_node_description();
+				descr->nodes[ram_memory_node] = STARPU_SCC_SHM;
+
+				is_a_set_of_accelerators = 0;
+				memory_node = ram_memory_node;
+				_starpu_memory_node_worker_add(memory_node);
+			}
+				break;
+#endif
+
 			default:
 			default:
 				STARPU_ABORT();
 				STARPU_ABORT();
 		}
 		}
@@ -902,18 +1278,18 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config)
 
 
 
 
 int
 int
-_starpu_build_topology (struct _starpu_machine_config *config)
+_starpu_build_topology (struct _starpu_machine_config *config, int no_mp_config)
 {
 {
 	int ret;
 	int ret;
 
 
-	ret = _starpu_init_machine_config(config);
+	ret = _starpu_init_machine_config(config, no_mp_config);
 	if (ret)
 	if (ret)
 		return ret;
 		return ret;
 
 
 	/* for the data management library */
 	/* for the data management library */
 	_starpu_memory_nodes_init();
 	_starpu_memory_nodes_init();
 
 
-	_starpu_init_workers_binding(config);
+	_starpu_init_workers_binding(config, no_mp_config);
 
 
 	return 0;
 	return 0;
 }
 }

+ 1 - 1
src/core/topology.h

@@ -27,7 +27,7 @@
 struct _starpu_machine_config;
 struct _starpu_machine_config;
 
 
 /* Detect the number of memory nodes and where to bind the different workers. */
 /* Detect the number of memory nodes and where to bind the different workers. */
-int _starpu_build_topology(struct _starpu_machine_config *config);
+int _starpu_build_topology(struct _starpu_machine_config *config, int no_mp_config);
 
 
 /* Destroy all resources used to store the topology of the machine. */
 /* Destroy all resources used to store the topology of the machine. */
 void _starpu_destroy_topology(struct _starpu_machine_config *config);
 void _starpu_destroy_topology(struct _starpu_machine_config *config);

+ 219 - 2
src/core/workers.c

@@ -28,6 +28,8 @@
 #include <core/task.h>
 #include <core/task.h>
 #include <profiling/profiling.h>
 #include <profiling/profiling.h>
 #include <starpu_task_list.h>
 #include <starpu_task_list.h>
+#include <drivers/mp_common/sink_common.h>
+#include <drivers/scc/driver_scc_common.h>
 
 
 #include <drivers/cpu/driver_cpu.h>
 #include <drivers/cpu/driver_cpu.h>
 #include <drivers/cuda/driver_cuda.h>
 #include <drivers/cuda/driver_cuda.h>
@@ -51,6 +53,29 @@ static starpu_pthread_key_t worker_key;
 
 
 static struct _starpu_machine_config config;
 static struct _starpu_machine_config config;
 
 
+/* Pointers to the program's argc and argv, as recorded by
+ * _starpu_set_argc_argv(). The pointers are stored as-is (nothing is
+ * copied) and are NULL until they are recorded; they may also be NULL
+ * afterwards, since starpu_init() forwards NULL for both.
+ */
+static int *my_argc = NULL;
+static char ***my_argv = NULL;
+
+/* Record the addresses of argc and argv so that they can be retrieved later
+ * through _starpu_get_argc() and _starpu_get_argv().
+ */
+void _starpu_set_argc_argv(int *argc_param, char ***argv_param)
+{
+	my_argc = argc_param;
+	my_argv = argv_param;
+}
+
+/* Return the argc pointer last recorded by _starpu_set_argc_argv(). */
+int *_starpu_get_argc(void)
+{
+	return my_argc;
+}
+
+/* Return the argv pointer last recorded by _starpu_set_argc_argv(). */
+char ***_starpu_get_argv(void)
+{
+	return my_argv;
+}
+
 int _starpu_is_initialized(void)
 int _starpu_is_initialized(void)
 {
 {
 	return initialized == INITIALIZED;
 	return initialized == INITIALIZED;
@@ -140,6 +165,16 @@ uint32_t _starpu_worker_exists(struct starpu_task *task)
 	    _starpu_worker_exists_and_can_execute(task, STARPU_OPENCL_WORKER))
 	    _starpu_worker_exists_and_can_execute(task, STARPU_OPENCL_WORKER))
 		return 1;
 		return 1;
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+	if ((task->cl->where & STARPU_MIC) &&
+	    _starpu_worker_exists_and_can_execute(task, STARPU_MIC_WORKER))
+		return 1;
+#endif
+#ifdef STARPU_USE_SCC
+	if ((task->cl->where & STARPU_SCC) &&
+	    _starpu_worker_exists_and_can_execute(task, STARPU_SCC_WORKER))
+		return 1;
+#endif
 	return 0;
 	return 0;
 }
 }
 
 
@@ -158,6 +193,11 @@ uint32_t _starpu_can_submit_opencl_task(void)
 	return (STARPU_OPENCL & config.worker_mask);
 	return (STARPU_OPENCL & config.worker_mask);
 }
 }
 
 
+/* Is there a worker that can execute SCC code ? Returns the STARPU_SCC bit of
+ * the machine configuration's worker mask, i.e. non-zero when it is set. */
+uint32_t _starpu_can_submit_scc_task(void)
+{
+	return (STARPU_SCC & config.worker_mask);
+}
+
 static int _starpu_can_use_nth_implementation(enum starpu_worker_archtype arch, struct starpu_codelet *cl, unsigned nimpl)
 static int _starpu_can_use_nth_implementation(enum starpu_worker_archtype arch, struct starpu_codelet *cl, unsigned nimpl)
 {
 {
 	switch(arch)
 	switch(arch)
@@ -196,13 +236,26 @@ static int _starpu_can_use_nth_implementation(enum starpu_worker_archtype arch,
 		starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, nimpl);
 		starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, nimpl);
 		return func != NULL;
 		return func != NULL;
 	}
 	}
+	case STARPU_MIC_WORKER:
+	{
+		starpu_mic_func_t func = _starpu_task_get_mic_nth_implementation(cl, nimpl);
+		char *func_name = _starpu_task_get_cpu_name_nth_implementation(cl, nimpl);
+
+		return func != NULL || func_name != NULL;
+	}
+	case STARPU_SCC_WORKER:
+	{
+		starpu_scc_func_t func = _starpu_task_get_scc_nth_implementation(cl, nimpl);
+		char *func_name = _starpu_task_get_cpu_name_nth_implementation(cl, nimpl);
+
+		return func != NULL || func_name != NULL;
+	}
 	default:
 	default:
 		STARPU_ASSERT_MSG(0, "Unknown arch type %d", arch);
 		STARPU_ASSERT_MSG(0, "Unknown arch type %d", arch);
 	}
 	}
 	return 0;
 	return 0;
 }
 }
 
 
-
 int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 {
 {
 	/* TODO: check that the task operand sizes will fit on that device */
 	/* TODO: check that the task operand sizes will fit on that device */
@@ -255,6 +308,11 @@ int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_tas
  * Runtime initialization methods
  * Runtime initialization methods
  */
  */
 
 
+#ifdef STARPU_USE_MIC
+static unsigned mic_initiated[STARPU_MAXMICDEVS];
+static struct _starpu_worker_set mic_worker_set[STARPU_MAXMICDEVS];
+#endif
+
 static void _starpu_init_worker_queue(struct _starpu_worker *workerarg)
 static void _starpu_init_worker_queue(struct _starpu_worker *workerarg)
 {
 {
 	starpu_pthread_cond_t *cond = &workerarg->sched_cond;
 	starpu_pthread_cond_t *cond = &workerarg->sched_cond;
@@ -374,6 +432,9 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 	for (worker = 0; worker < nworkers; worker++)
 	for (worker = 0; worker < nworkers; worker++)
 	{
 	{
 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
+#ifdef STARPU_USE_MIC
+		unsigned mp_nodeid = workerarg->mp_nodeid;
+#endif
 
 
 		workerarg->config = pconfig;
 		workerarg->config = pconfig;
 
 
@@ -503,6 +564,72 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #endif
 #endif
 				break;
 				break;
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+			case STARPU_MIC_WORKER:
+				/* We use the Gordon approach for the MIC,
+				 * which consists in spawning only one thread
+				 * per MIC device, which will control all MIC
+				 * workers of this device. (by using a worker set). */
+				if (mic_initiated[mp_nodeid])
+					goto worker_set_initialized;
+
+				/* Read the topology through the pconfig
+				 * parameter: the file-scope `config' is a
+				 * struct, not a pointer. */
+				mic_worker_set[mp_nodeid].nworkers = pconfig->topology.nmiccores[mp_nodeid];
+
+				/* We assume all MIC workers of a given MIC
+				 * device are contiguous so that we can
+				 * address them with the first one only. */
+				mic_worker_set[mp_nodeid].workers = workerarg;
+				mic_worker_set[mp_nodeid].set_is_initialized = 0;
+
+				STARPU_PTHREAD_CREATE(
+						workerarg->name,
+						&mic_worker_set[mp_nodeid].worker_thread,
+						NULL,
+						_starpu_mic_src_worker,
+						&mic_worker_set[mp_nodeid]);
+
+				/* Block until the device thread reports that
+				 * the whole worker set is initialized. */
+				STARPU_PTHREAD_MUTEX_LOCK(&mic_worker_set[mp_nodeid].mutex);
+				while (!mic_worker_set[mp_nodeid].set_is_initialized)
+					STARPU_PTHREAD_COND_WAIT(&mic_worker_set[mp_nodeid].ready_cond,
+								 &mic_worker_set[mp_nodeid].mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&mic_worker_set[mp_nodeid].mutex);
+
+				mic_initiated[mp_nodeid] = 1;
+
+		worker_set_initialized:
+				/* Reached for every MIC worker, including the
+				 * ones whose device thread already exists. */
+				workerarg->set = &mic_worker_set[mp_nodeid];
+				mic_worker_set[mp_nodeid].joined = 0;
+				workerarg->worker_is_running = 1;
+
+#ifdef STARPU_USE_FXT
+				/* NOTE(review): worker_is_running is set to 1
+				 * just above, so this loop looks like it can
+				 * never wait -- confirm whether the assignment
+				 * or the wait is the intended behaviour. */
+				STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
+				while (!workerarg->worker_is_running)
+					STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
+#endif
+
+				break;
+#endif /* STARPU_USE_MIC */
+#ifdef STARPU_USE_SCC
+			case STARPU_SCC_WORKER:
+				/* Unlike MIC, each SCC worker gets its own
+				 * thread and does not belong to a worker set. */
+				workerarg->set = NULL;
+				workerarg->worker_is_initialized = 0;
+				STARPU_PTHREAD_CREATE(
+						workerarg->name,
+						&workerarg->worker_thread,
+						NULL,
+						_starpu_scc_src_worker,
+						workerarg);
+
+#ifdef STARPU_USE_FXT
+				/* Wait until the worker thread has flagged
+				 * itself as running. */
+				STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
+				while (!workerarg->worker_is_running)
+					STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
+#endif
+				break;
+#endif
+
 			default:
 			default:
 				STARPU_ABORT();
 				STARPU_ABORT();
 		}
 		}
@@ -598,8 +725,11 @@ int starpu_conf_init(struct starpu_conf *conf)
 		conf->ncpus = starpu_get_env_number("STARPU_NCPUS");
 		conf->ncpus = starpu_get_env_number("STARPU_NCPUS");
 	conf->ncuda = starpu_get_env_number("STARPU_NCUDA");
 	conf->ncuda = starpu_get_env_number("STARPU_NCUDA");
 	conf->nopencl = starpu_get_env_number("STARPU_NOPENCL");
 	conf->nopencl = starpu_get_env_number("STARPU_NOPENCL");
+	conf->nmic = starpu_get_env_number("STARPU_NMIC");
+	conf->nscc = starpu_get_env_number("STARPU_NSCC");
 	conf->calibrate = starpu_get_env_number("STARPU_CALIBRATE");
 	conf->calibrate = starpu_get_env_number("STARPU_CALIBRATE");
 	conf->bus_calibrate = starpu_get_env_number("STARPU_BUS_CALIBRATE");
 	conf->bus_calibrate = starpu_get_env_number("STARPU_BUS_CALIBRATE");
+	conf->mic_sink_program_path = getenv("STARPU_MIC_PROGRAM_PATH");
 
 
 	if (conf->calibrate == -1)
 	if (conf->calibrate == -1)
 	     conf->calibrate = 0;
 	     conf->calibrate = 0;
@@ -610,6 +740,8 @@ int starpu_conf_init(struct starpu_conf *conf)
 	conf->use_explicit_workers_bindid = 0; /* TODO */
 	conf->use_explicit_workers_bindid = 0; /* TODO */
 	conf->use_explicit_workers_cuda_gpuid = 0; /* TODO */
 	conf->use_explicit_workers_cuda_gpuid = 0; /* TODO */
 	conf->use_explicit_workers_opencl_gpuid = 0; /* TODO */
 	conf->use_explicit_workers_opencl_gpuid = 0; /* TODO */
+	conf->use_explicit_workers_mic_deviceid = 0; /* TODO */
+	conf->use_explicit_workers_scc_deviceid = 0; /* TODO */
 
 
 	conf->single_combined_worker = starpu_get_env_number("STARPU_SINGLE_COMBINED_WORKER");
 	conf->single_combined_worker = starpu_get_env_number("STARPU_SINGLE_COMBINED_WORKER");
 	if (conf->single_combined_worker == -1)
 	if (conf->single_combined_worker == -1)
@@ -676,6 +808,32 @@ void _starpu_conf_check_environment(struct starpu_conf *conf)
 
 
 int starpu_init(struct starpu_conf *user_conf)
 int starpu_init(struct starpu_conf *user_conf)
 {
 {
+	return starpu_initialize(user_conf, NULL, NULL);
+}
+
+int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
+{
+	int is_a_sink = 0; /* Always defined. If the MP infrastructure is not
+			    * used, we cannot be a sink. */
+#ifdef STARPU_USE_MP
+	_starpu_set_argc_argv(argc, argv);
+
+#	ifdef STARPU_USE_SCC
+	/* In SCC case we look at the rank to know if we are a sink */
+	if (_starpu_scc_common_mp_init() && !_starpu_scc_common_is_src_node())
+		setenv("STARPU_SINK", "STARPU_SCC", 1);
+#	endif
+
+	/* If StarPU was configured to use MP sinks, we have to check which
+	 * kind of node we are running on: host or sink? */
+	if (getenv("STARPU_SINK"))
+		is_a_sink = 1;
+#else
+	(void)argc;
+	(void)argv;
+
+#endif /* STARPU_USE_MP */
+
 	int ret;
 	int ret;
 
 
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
@@ -783,11 +941,17 @@ int starpu_init(struct starpu_conf *user_conf)
 
 
 	_starpu_load_bus_performance_files();
 	_starpu_load_bus_performance_files();
 
 
-	ret = _starpu_build_topology(&config);
+	/* Depending on whether we are a MP sink or not, we must build the
+	 * topology with MP nodes or not. */
+	ret = _starpu_build_topology(&config, is_a_sink ? 1 : 0);
 	if (ret)
 	if (ret)
 	{
 	{
 		STARPU_PTHREAD_MUTEX_LOCK(&init_mutex);
 		STARPU_PTHREAD_MUTEX_LOCK(&init_mutex);
 		init_count--;
 		init_count--;
+#ifdef STARPU_USE_SCC
+		if (_starpu_scc_common_is_mp_initialized())
+			_starpu_scc_src_mp_deinit();
+#endif
 		initialized = UNINITIALIZED;
 		initialized = UNINITIALIZED;
 		/* Let somebody else try to do it */
 		/* Let somebody else try to do it */
 		STARPU_PTHREAD_COND_SIGNAL(&init_cond);
 		STARPU_PTHREAD_COND_SIGNAL(&init_cond);
@@ -799,6 +963,14 @@ int starpu_init(struct starpu_conf *user_conf)
 	 * threads */
 	 * threads */
 	_starpu_initialize_current_task_key();
 	_starpu_initialize_current_task_key();
 
 
+	/* Theoretically, MP sinks should not have to initialize the scheduling
+	 * policy: indeed, they do not have their own one but are under the
+	 * order of the MP source's one.
+	 *
+	 * For reasons unknown to me (excluding the fact that this software is
+	 * not understandable for normally-formed human brains...), skipping
+	 * this step makes _starpu_launch_drivers() hang.
+	 */
 	_starpu_create_sched_ctx(config.conf->sched_policy_name, NULL, -1, 1, "init");
 	_starpu_create_sched_ctx(config.conf->sched_policy_name, NULL, -1, 1, "init");
 
 
 	_starpu_initialize_registered_performance_models();
 	_starpu_initialize_registered_performance_models();
@@ -813,6 +985,20 @@ int starpu_init(struct starpu_conf *user_conf)
 	STARPU_PTHREAD_MUTEX_UNLOCK(&init_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&init_mutex);
 
 
 	_STARPU_DEBUG("Initialisation finished\n");
 	_STARPU_DEBUG("Initialisation finished\n");
+
+#ifdef STARPU_USE_MP
+	/* Finally, if we are a MP sink, we never leave this function:
+	 * we enter an infinite event loop which listens for MP commands from
+	 * the source. */
+	if (is_a_sink) {
+		_starpu_sink_common_worker();
+
+		/* We should normally never leave the loop as we don't want to
+		 * really initialize STARPU */
+		STARPU_ASSERT(0);
+	}
+#endif
+
 	return 0;
 	return 0;
 }
 }
 
 
@@ -1012,6 +1198,11 @@ void starpu_shutdown(void)
 	if (AYU_event) AYU_event(AYU_FINISH, 0, NULL);
 	if (AYU_event) AYU_event(AYU_FINISH, 0, NULL);
 #endif
 #endif
 
 
+#ifdef STARPU_USE_SCC
+	if (_starpu_scc_common_is_mp_initialized())
+		_starpu_scc_src_mp_deinit();
+#endif
+
 	_STARPU_DEBUG("Shutdown finished\n");
 	_STARPU_DEBUG("Shutdown finished\n");
 }
 }
 
 
@@ -1033,6 +1224,12 @@ int starpu_worker_get_count_by_type(enum starpu_worker_archtype type)
 		case STARPU_OPENCL_WORKER:
 		case STARPU_OPENCL_WORKER:
 			return config.topology.nopenclgpus;
 			return config.topology.nopenclgpus;
 
 
+		case STARPU_MIC_WORKER:
+			return config.topology.nmicdevices;
+
+		case STARPU_SCC_WORKER:
+			return config.topology.nsccdevices;
+
 		default:
 		default:
 			return -EINVAL;
 			return -EINVAL;
 	}
 	}
@@ -1073,6 +1270,21 @@ int starpu_asynchronous_opencl_copy_disabled(void)
 	return config.conf->disable_asynchronous_opencl_copy;
 	return config.conf->disable_asynchronous_opencl_copy;
 }
 }
 
 
+/* Return the total number of MIC workers, computed as the sum of the
+ * per-device core counts stored in the topology for all STARPU_MAXMICDEVS
+ * device slots. */
+unsigned starpu_mic_worker_get_count(void)
+{
+	int total = 0;
+	int dev;
+
+	for (dev = 0; dev < STARPU_MAXMICDEVS; ++dev)
+	{
+		total += config.topology.nmiccores[dev];
+	}
+
+	return total;
+}
+
+/* Return the number of SCC devices recorded in the topology
+ * (config.topology.nsccdevices). */
+unsigned starpu_scc_worker_get_count(void)
+{
+	return config.topology.nsccdevices;
+}
+
 /* When analyzing performance, it is useful to see what is the processing unit
 /* When analyzing performance, it is useful to see what is the processing unit
  * that actually performed the task. This function returns the id of the
  * that actually performed the task. This function returns the id of the
  * processing unit actually executing it, therefore it makes no sense to use it
  * processing unit actually executing it, therefore it makes no sense to use it
@@ -1146,6 +1358,11 @@ int starpu_combined_worker_get_rank(void)
 	}
 	}
 }
 }
 
 
+/* Return the mp_nodeid field of worker `id' (per struct _starpu_worker: the
+ * MP node which holds the device, -1 for this node). `id' is used directly
+ * as an index into config.workers, without any range check. */
+int starpu_worker_get_mp_nodeid(int id)
+{
+	return config.workers[id].mp_nodeid;
+}
+
 int starpu_worker_get_devid(int id)
 int starpu_worker_get_devid(int id)
 {
 {
 	return config.workers[id].devid;
 	return config.workers[id].devid;

+ 14 - 0
src/core/workers.h

@@ -37,6 +37,15 @@
 #include <drivers/cuda/driver_cuda.h>
 #include <drivers/cuda/driver_cuda.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 
 
+#ifdef STARPU_USE_MIC
+#include <drivers/mic/driver_mic_source.h>
+#endif /* STARPU_USE_MIC */
+
+#ifdef STARPU_USE_SCC
+#include <drivers/scc/driver_scc_source.h>
+#endif
+
+
 #include <drivers/cpu/driver_cpu.h>
 #include <drivers/cpu/driver_cpu.h>
 
 
 #include <datawizard/datawizard.h>
 #include <datawizard/datawizard.h>
@@ -51,6 +60,8 @@ struct _starpu_worker
 	uint32_t worker_mask; /* what is the type of worker ? */
 	uint32_t worker_mask; /* what is the type of worker ? */
 	enum starpu_perfmodel_archtype perf_arch; /* in case there are different models of the same arch */
 	enum starpu_perfmodel_archtype perf_arch; /* in case there are different models of the same arch */
 	starpu_pthread_t worker_thread; /* the thread which runs the worker */
 	starpu_pthread_t worker_thread; /* the thread which runs the worker */
+	int mp_nodeid; /* which mp node hold the cpu/gpu/etc (-1 for this
+			* node) */
 	unsigned devid; /* which cpu/gpu/etc is controlled by the worker ? */
 	unsigned devid; /* which cpu/gpu/etc is controlled by the worker ? */
 	int bindid; /* which cpu is the driver bound to ? (logical index) */
 	int bindid; /* which cpu is the driver bound to ? (logical index) */
 	int workerid; /* uniquely identify the worker among all processing units types */
 	int workerid; /* uniquely identify the worker among all processing units types */
@@ -199,6 +210,9 @@ uint32_t _starpu_can_submit_cpu_task(void);
 /* Is there a worker that can execute OpenCL code ? */
 /* Is there a worker that can execute OpenCL code ? */
 uint32_t _starpu_can_submit_opencl_task(void);
 uint32_t _starpu_can_submit_opencl_task(void);
 
 
+/* Is there a worker that can execute SCC code ? */
+uint32_t _starpu_can_submit_scc_task(void);
+
 /* Check whether there is anything that the worker should do instead of
 /* Check whether there is anything that the worker should do instead of
  * sleeping (waiting on something to happen). */
  * sleeping (waiting on something to happen). */
 unsigned _starpu_worker_can_block(unsigned memnode);
 unsigned _starpu_worker_can_block(unsigned memnode);

+ 5 - 0
src/datawizard/coherency.c

@@ -180,6 +180,11 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 #endif
 #endif
 		case STARPU_OPENCL_RAM:
 		case STARPU_OPENCL_RAM:
 			return 0;
 			return 0;
+		case STARPU_MIC_RAM:
+			/* We don't handle direct MIC-MIC transfers yet */
+			return 0;
+		case STARPU_SCC_RAM:
+			return 1;
 		default:
 		default:
 			return 1;
 			return 1;
 	}
 	}

+ 138 - 0
src/datawizard/copy_driver.c

@@ -320,6 +320,93 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 		}
 		}
 		break;
 		break;
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_MIC_RAM):
+		/* RAM -> MIC */
+#	ifdef STARPU_MIC_USE_RMA
+		if (!req || starpu_asynchronous_copy_disabled() ||
+				!(copy_methods->ram_to_mic_async || copy_methods->any_to_any))
+		{
+			/* this is not associated to a request so it's synchronous */
+			STARPU_ASSERT(copy_methods->ram_to_mic || copy_methods->any_to_any);
+			if (copy_methods->ram_to_mic)
+				copy_methods->ram_to_mic(src_interface, src_node, dst_interface, dst_node);
+			else
+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+		}
+		else
+		{
+			req->async_channel.type = STARPU_MIC_RAM;
+			if (copy_methods->ram_to_mic_async)
+				ret = copy_methods->ram_to_mic_async(src_interface, src_node, dst_interface, dst_node);
+			else
+			{
+				STARPU_ASSERT(copy_methods->any_to_any);
+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+			}
+			_starpu_mic_init_event(&(req->async_channel.event.mic_event), dst_node);
+		}
+		break;
+#	else
+		copy_methods->ram_to_mic(src_interface, src_node, dst_interface, dst_node);
+		break;
+#	endif
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_MIC_RAM,STARPU_CPU_RAM):
+		/* MIC -> RAM */
+#	ifdef STARPU_MIC_USE_RMA
+		if (!req || starpu_asynchronous_copy_disabled() ||
+				!(copy_methods->mic_to_ram_async || copy_methods->any_to_any))
+		{
+			/* this is not associated to a request so it's synchronous */
+			STARPU_ASSERT(copy_methods->mic_to_ram || copy_methods->any_to_any);
+			if (copy_methods->mic_to_ram)
+				copy_methods->mic_to_ram(src_interface, src_node, dst_interface, dst_node);
+			else
+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+		}
+		else
+		{
+			req->async_channel.type = STARPU_MIC_RAM;
+			if (copy_methods->mic_to_ram_async)
+				ret = copy_methods->mic_to_ram_async(src_interface, src_node, dst_interface, dst_node);
+			else
+			{
+				STARPU_ASSERT(copy_methods->any_to_any);
+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+			}
+			_starpu_mic_init_event(&(req->async_channel.event.mic_event), src_node);
+		}
+		break;
+#	else
+		copy_methods->mic_to_ram(src_interface, src_node, dst_interface, dst_node);
+		break;
+#	endif
+#endif
+#ifdef STARPU_USE_SCC
+		/* SCC RAM associated to the master process is considered as
+		 * the main memory node. */
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_SCC_RAM):
+		/* master private SCC RAM -> slave private SCC RAM */
+		if (copy_methods->scc_src_to_sink)
+			copy_methods->scc_src_to_sink(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+		break;
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_CPU_RAM):
+		/* slave private SCC RAM -> master private SCC RAM */
+		if (copy_methods->scc_sink_to_src)
+			copy_methods->scc_sink_to_src(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+		break;
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_SCC_RAM):
+		/* slave private SCC RAM -> slave private SCC RAM */
+		if (copy_methods->scc_sink_to_sink)
+			copy_methods->scc_sink_to_sink(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+		break;
+#endif
 	default:
 	default:
 		STARPU_ABORT();
 		STARPU_ABORT();
 		break;
 		break;
@@ -436,6 +523,47 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 				size,
 				size,
 				&async_channel->event.opencl_event);
 				&async_channel->event.opencl_event);
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+	/* Raw copies between main memory and a MIC device; the asynchronous
+	 * helper is used when async_data is non-NULL, the synchronous one
+	 * otherwise.
+	 *
+	 * NOTE(review): in copy_data_1_to_1_generic() the
+	 * (STARPU_CPU_RAM, STARPU_MIC_RAM) tuple is the "RAM -> MIC"
+	 * direction, whereas here the (STARPU_MIC_RAM, STARPU_CPU_RAM) case
+	 * calls the ram_to_mic helpers and the (STARPU_CPU_RAM,
+	 * STARPU_MIC_RAM) case calls the mic_to_ram ones. The two cases look
+	 * swapped -- confirm against the switch expression. */
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_MIC_RAM,STARPU_CPU_RAM):
+		if (async_data)
+			return _starpu_mic_copy_ram_to_mic_async(
+					(void*) src + src_offset, src_node,
+					(void*) dst + dst_offset, dst_node,
+					size);
+		else
+			return _starpu_mic_copy_ram_to_mic(
+					(void*) src + src_offset, src_node,
+					(void*) dst + dst_offset, dst_node,
+					size);
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_MIC_RAM):
+		if (async_data)
+			return _starpu_mic_copy_mic_to_ram_async(
+					(void*) src + src_offset, src_node,
+					(void*) dst + dst_offset, dst_node,
+					size);
+		else
+			return _starpu_mic_copy_mic_to_ram(
+					(void*) src + src_offset, src_node,
+					(void*) dst + dst_offset, dst_node,
+					size);
+#endif
+#ifdef STARPU_USE_SCC
+	/* Raw copies between SCC memory nodes. Each case has to leave the
+	 * switch by itself: without a return, one copy would fall through
+	 * into the next ones and finally into the STARPU_ABORT() of the
+	 * default case. The result of the copy helpers is not inspected, as
+	 * in the SCC copy methods of the data interfaces.
+	 *
+	 * NOTE(review): in copy_data_1_to_1_generic() the
+	 * (STARPU_CPU_RAM, STARPU_SCC_RAM) tuple is the source -> sink
+	 * direction, whereas here it is the (STARPU_SCC_RAM, STARPU_CPU_RAM)
+	 * case which calls the src_to_sink helper. The first two cases look
+	 * swapped -- confirm against the switch expression. */
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_CPU_RAM):
+		_starpu_scc_copy_src_to_sink(
+				(void*) src + src_offset, src_node,
+				(void*) dst + dst_offset, dst_node,
+				size);
+		return 0;
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_SCC_RAM):
+		_starpu_scc_copy_sink_to_src(
+				(void*) src + src_offset, src_node,
+				(void*) dst + dst_offset, dst_node,
+				size);
+		return 0;
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_SCC_RAM):
+		_starpu_scc_copy_sink_to_sink(
+				(void*) src + src_offset, src_node,
+				(void*) dst + dst_offset, dst_node,
+				size);
+		return 0;
+#endif
 	default:
 	default:
 		STARPU_ABORT();
 		STARPU_ABORT();
 		return -1;
 		return -1;
@@ -488,6 +616,11 @@ void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_
 	      break;
 	      break;
 	}
 	}
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+	case STARPU_MIC_RAM:
+		_starpu_mic_wait_request_completion(&(async_channel->event.mic_event));
+		break;
+#endif
 	case STARPU_CPU_RAM:
 	case STARPU_CPU_RAM:
 	default:
 	default:
 		STARPU_ABORT();
 		STARPU_ABORT();
@@ -539,6 +672,11 @@ unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *as
 		break;
 		break;
 	}
 	}
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+	case STARPU_MIC_RAM:
+		success = _starpu_mic_request_is_complete(&(async_channel->event.mic_event));
+		break;
+#endif
 	case STARPU_CPU_RAM:
 	case STARPU_CPU_RAM:
 	default:
 	default:
 		STARPU_ABORT();
 		STARPU_ABORT();

+ 15 - 0
src/datawizard/copy_driver.h

@@ -36,6 +36,18 @@
 struct _starpu_data_request;
 struct _starpu_data_request;
 struct _starpu_data_replicate;
 struct _starpu_data_replicate;
 
 
+#ifdef STARPU_USE_MIC
+/* MIC needs memory_node to know which MIC is concerned.
+ * mark is used to wait for an asynchronous request.
+ * signal is used to test an asynchronous request. */
+struct _starpu_mic_async_event
+{
+	unsigned memory_node;
+	int mark;
+	uint64_t *signal;
+};
+#endif
+
 /* this is a structure that can be queried to see whether an asynchronous
 /* this is a structure that can be queried to see whether an asynchronous
  * transfer has terminated or not */
  * transfer has terminated or not */
 union _starpu_async_channel_event
 union _starpu_async_channel_event
@@ -54,6 +66,9 @@ union _starpu_async_channel_event
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
         cl_event opencl_event;
         cl_event opencl_event;
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+	struct _starpu_mic_async_event mic_event;
+#endif
 };
 };
 
 
 struct _starpu_async_channel
 struct _starpu_async_channel

+ 1 - 0
src/datawizard/interfaces/bcsr_filters.c

@@ -35,6 +35,7 @@ void starpu_bcsr_filter_canonical_block(void *father_interface, void *child_inte
 
 
 	uint32_t ptr_offset = c*r*id*elemsize;
 	uint32_t ptr_offset = c*r*id*elemsize;
 
 
+	matrix_child->id = STARPU_MATRIX_INTERFACE_ID;
 	matrix_child->nx = c;
 	matrix_child->nx = c;
 	matrix_child->ny = r;
 	matrix_child->ny = r;
 	matrix_child->ld = c;
 	matrix_child->ld = c;

+ 4 - 2
src/datawizard/interfaces/bcsr_interface.c

@@ -46,7 +46,7 @@ static int bcsr_compare(void *data_interface_a, void *data_interface_b);
 static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle_t handle);
 static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle_t handle);
 
 
 
 
-static struct starpu_data_interface_ops interface_bcsr_ops =
+struct starpu_data_interface_ops starpu_interface_bcsr_ops =
 {
 {
 	.register_data_handle = register_bcsr_handle,
 	.register_data_handle = register_bcsr_handle,
 	.allocate_data_on_node = allocate_bcsr_buffer_on_node,
 	.allocate_data_on_node = allocate_bcsr_buffer_on_node,
@@ -82,6 +82,7 @@ static void register_bcsr_handle(starpu_data_handle_t handle, unsigned home_node
 			local_interface->rowptr = NULL;
 			local_interface->rowptr = NULL;
 		}
 		}
 
 
+		local_interface->id = bcsr_interface->id;
 		local_interface->nnz = bcsr_interface->nnz;
 		local_interface->nnz = bcsr_interface->nnz;
 		local_interface->nrow = bcsr_interface->nrow;
 		local_interface->nrow = bcsr_interface->nrow;
 		local_interface->firstentry = bcsr_interface->firstentry;
 		local_interface->firstentry = bcsr_interface->firstentry;
@@ -98,6 +99,7 @@ void starpu_bcsr_data_register(starpu_data_handle_t *handleptr, unsigned home_no
 {
 {
 	struct starpu_bcsr_interface bcsr_interface =
 	struct starpu_bcsr_interface bcsr_interface =
 	{
 	{
+		.id = STARPU_BCSR_INTERFACE_ID,
 		.nzval = nzval,
 		.nzval = nzval,
 		.colind = colind,
 		.colind = colind,
 		.rowptr = rowptr,
 		.rowptr = rowptr,
@@ -109,7 +111,7 @@ void starpu_bcsr_data_register(starpu_data_handle_t *handleptr, unsigned home_no
 		.elemsize = elemsize
 		.elemsize = elemsize
 	};
 	};
 
 
-	starpu_data_register(handleptr, home_node, &bcsr_interface, &interface_bcsr_ops);
+	starpu_data_register(handleptr, home_node, &bcsr_interface, &starpu_interface_bcsr_ops);
 }
 }
 
 
 static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle_t handle)
 static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle_t handle)

+ 1 - 0
src/datawizard/interfaces/block_filters.c

@@ -37,6 +37,7 @@ void starpu_block_filter_block(void *father_interface, void *child_interface, ST
 	_starpu_filter_nparts_compute_chunk_size_and_offset(nx, nparts, elemsize, id, 1,
 	_starpu_filter_nparts_compute_chunk_size_and_offset(nx, nparts, elemsize, id, 1,
 				       &chunk_size, &offset);
 				       &chunk_size, &offset);
 
 
+	block_child->id = block_father->id;
 	block_child->nx = chunk_size;
 	block_child->nx = chunk_size;
 	block_child->ny = ny;
 	block_child->ny = ny;
 	block_child->nz = nz;
 	block_child->nz = nz;

+ 186 - 2
src/datawizard/interfaces/block_interface.c

@@ -26,6 +26,8 @@
 #include <starpu_cuda.h>
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
+#include <drivers/scc/driver_scc_source.h>
+#include <drivers/mic/driver_mic_source.h>
 
 
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -43,6 +45,17 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
 static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
 static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
 #endif
 #endif
+#ifdef STARPU_USE_SCC
+static int copy_scc_src_to_sink(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_scc_sink_to_src(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_scc_sink_to_sink(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+#endif
+#ifdef STARPU_USE_MIC
+static int copy_ram_to_mic(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_mic_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_ram_to_mic_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_mic_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+#endif
 
 
 static const struct starpu_data_copy_methods block_copy_data_methods_s =
 static const struct starpu_data_copy_methods block_copy_data_methods_s =
 {
 {
@@ -74,7 +87,7 @@ static uint32_t footprint_block_interface_crc32(starpu_data_handle_t handle);
 static int block_compare(void *data_interface_a, void *data_interface_b);
 static int block_compare(void *data_interface_a, void *data_interface_b);
 static void display_block_interface(starpu_data_handle_t handle, FILE *f);
 static void display_block_interface(starpu_data_handle_t handle, FILE *f);
 
 
-static struct starpu_data_interface_ops interface_block_ops =
+struct starpu_data_interface_ops starpu_interface_block_ops =
 {
 {
 	.register_data_handle = register_block_handle,
 	.register_data_handle = register_block_handle,
 	.allocate_data_on_node = allocate_block_buffer_on_node,
 	.allocate_data_on_node = allocate_block_buffer_on_node,
@@ -126,6 +139,7 @@ static void register_block_handle(starpu_data_handle_t handle, unsigned home_nod
 			local_interface->ldz  = 0;
 			local_interface->ldz  = 0;
 		}
 		}
 
 
+		local_interface->id = block_interface->id;
 		local_interface->nx = block_interface->nx;
 		local_interface->nx = block_interface->nx;
 		local_interface->ny = block_interface->ny;
 		local_interface->ny = block_interface->ny;
 		local_interface->nz = block_interface->nz;
 		local_interface->nz = block_interface->nz;
@@ -140,6 +154,7 @@ void starpu_block_data_register(starpu_data_handle_t *handleptr, unsigned home_n
 {
 {
 	struct starpu_block_interface block_interface =
 	struct starpu_block_interface block_interface =
 	{
 	{
+		.id = STARPU_BLOCK_INTERFACE_ID,
 		.ptr = ptr,
 		.ptr = ptr,
                 .dev_handle = ptr,
                 .dev_handle = ptr,
                 .offset = 0,
                 .offset = 0,
@@ -151,7 +166,12 @@ void starpu_block_data_register(starpu_data_handle_t *handleptr, unsigned home_n
 		.elemsize = elemsize
 		.elemsize = elemsize
 	};
 	};
 
 
-	starpu_data_register(handleptr, home_node, &block_interface, &interface_block_ops);
+#ifdef STARPU_USE_SCC
+	_starpu_scc_set_offset_in_shared_memory((void*)block_interface.ptr,
+			(void**)&(block_interface.dev_handle), &(block_interface.offset));
+#endif
+
+	starpu_data_register(handleptr, home_node, &block_interface, &starpu_interface_block_ops);
 }
 }
 
 
 static uint32_t footprint_block_interface_crc32(starpu_data_handle_t handle)
 static uint32_t footprint_block_interface_crc32(starpu_data_handle_t handle)
@@ -584,6 +604,170 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_A
 
 
 #endif
 #endif
 
 
+#ifdef STARPU_USE_SCC
+/*
+ * Copy a 3D block from src_node to dst_node with _starpu_scc_copy_src_to_sink(),
+ * one row of nx elements at a time.
+ *
+ * The geometry (nx, ny, nz, elemsize) is read from the destination interface;
+ * each side uses its own ldy/ldz strides.  Always returns 0.
+ */
+static int copy_scc_src_to_sink(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	uint32_t nx = STARPU_BLOCK_GET_NX(dst_interface);
+	uint32_t ny = STARPU_BLOCK_GET_NY(dst_interface);
+	uint32_t nz = STARPU_BLOCK_GET_NZ(dst_interface);
+
+	size_t elemsize = STARPU_BLOCK_GET_ELEMSIZE(dst_interface);
+	size_t row_size = nx * elemsize;
+
+	/* Strides are widened to size_t so that the row offsets below are not
+	 * computed (and truncated) in 32 bits. */
+	size_t src_ldy = STARPU_BLOCK_GET_LDY(src_interface);
+	size_t src_ldz = STARPU_BLOCK_GET_LDZ(src_interface);
+	size_t dst_ldy = STARPU_BLOCK_GET_LDY(dst_interface);
+	size_t dst_ldz = STARPU_BLOCK_GET_LDZ(dst_interface);
+
+	/* char * rather than void *: pointer arithmetic on void * is a GNU extension. */
+	char *src_ptr = (char *)STARPU_BLOCK_GET_PTR(src_interface);
+	char *dst_ptr = (char *)STARPU_BLOCK_GET_PTR(dst_interface);
+
+	unsigned y, z;
+	for (z = 0; z < nz; ++z)
+	{
+		for (y = 0; y < ny; ++y)
+		{
+			size_t src_offset = (y*src_ldy + z*src_ldz) * elemsize;
+			size_t dst_offset = (y*dst_ldy + z*dst_ldz) * elemsize;
+
+			_starpu_scc_copy_src_to_sink((void *)(src_ptr + src_offset), src_node,
+							(void *)(dst_ptr + dst_offset), dst_node, row_size);
+		}
+	}
+
+	/* Total size is accumulated in size_t to avoid a 32-bit overflow. */
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, row_size*ny*nz);
+
+	return 0;
+}
+
+/*
+ * Copy a 3D block from src_node to dst_node with _starpu_scc_copy_sink_to_src(),
+ * one row of nx elements at a time.
+ *
+ * The geometry (nx, ny, nz, elemsize) is read from the destination interface;
+ * each side uses its own ldy/ldz strides.  Always returns 0.
+ */
+static int copy_scc_sink_to_src(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	uint32_t nx = STARPU_BLOCK_GET_NX(dst_interface);
+	uint32_t ny = STARPU_BLOCK_GET_NY(dst_interface);
+	uint32_t nz = STARPU_BLOCK_GET_NZ(dst_interface);
+
+	size_t elemsize = STARPU_BLOCK_GET_ELEMSIZE(dst_interface);
+	size_t row_size = nx * elemsize;
+
+	/* Strides are widened to size_t so that the row offsets below are not
+	 * computed (and truncated) in 32 bits. */
+	size_t src_ldy = STARPU_BLOCK_GET_LDY(src_interface);
+	size_t src_ldz = STARPU_BLOCK_GET_LDZ(src_interface);
+	size_t dst_ldy = STARPU_BLOCK_GET_LDY(dst_interface);
+	size_t dst_ldz = STARPU_BLOCK_GET_LDZ(dst_interface);
+
+	/* char * rather than void *: pointer arithmetic on void * is a GNU extension. */
+	char *src_ptr = (char *)STARPU_BLOCK_GET_PTR(src_interface);
+	char *dst_ptr = (char *)STARPU_BLOCK_GET_PTR(dst_interface);
+
+	unsigned y, z;
+	for (z = 0; z < nz; ++z)
+	{
+		for (y = 0; y < ny; ++y)
+		{
+			size_t src_offset = (y*src_ldy + z*src_ldz) * elemsize;
+			size_t dst_offset = (y*dst_ldy + z*dst_ldz) * elemsize;
+
+			_starpu_scc_copy_sink_to_src((void *)(src_ptr + src_offset), src_node,
+							(void *)(dst_ptr + dst_offset), dst_node, row_size);
+		}
+	}
+
+	/* Total size is accumulated in size_t to avoid a 32-bit overflow. */
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, row_size*ny*nz);
+
+	return 0;
+}
+
+/*
+ * Copy a 3D block from src_node to dst_node with _starpu_scc_copy_sink_to_sink(),
+ * one row of nx elements at a time.
+ *
+ * The geometry (nx, ny, nz, elemsize) is read from the destination interface;
+ * each side uses its own ldy/ldz strides.  Always returns 0.
+ */
+static int copy_scc_sink_to_sink(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	uint32_t nx = STARPU_BLOCK_GET_NX(dst_interface);
+	uint32_t ny = STARPU_BLOCK_GET_NY(dst_interface);
+	uint32_t nz = STARPU_BLOCK_GET_NZ(dst_interface);
+
+	size_t elemsize = STARPU_BLOCK_GET_ELEMSIZE(dst_interface);
+	size_t row_size = nx * elemsize;
+
+	/* Strides are widened to size_t so that the row offsets below are not
+	 * computed (and truncated) in 32 bits. */
+	size_t src_ldy = STARPU_BLOCK_GET_LDY(src_interface);
+	size_t src_ldz = STARPU_BLOCK_GET_LDZ(src_interface);
+	size_t dst_ldy = STARPU_BLOCK_GET_LDY(dst_interface);
+	size_t dst_ldz = STARPU_BLOCK_GET_LDZ(dst_interface);
+
+	/* char * rather than void *: pointer arithmetic on void * is a GNU extension. */
+	char *src_ptr = (char *)STARPU_BLOCK_GET_PTR(src_interface);
+	char *dst_ptr = (char *)STARPU_BLOCK_GET_PTR(dst_interface);
+
+	unsigned y, z;
+	for (z = 0; z < nz; ++z)
+	{
+		for (y = 0; y < ny; ++y)
+		{
+			size_t src_offset = (y*src_ldy + z*src_ldz) * elemsize;
+			size_t dst_offset = (y*dst_ldy + z*dst_ldz) * elemsize;
+
+			_starpu_scc_copy_sink_to_sink((void *)(src_ptr + src_offset), src_node,
+					(void *)(dst_ptr + dst_offset), dst_node, row_size);
+		}
+	}
+
+	/* Total size is accumulated in size_t to avoid a 32-bit overflow. */
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, row_size*ny*nz);
+
+	return 0;
+}
+#endif /* STARPU_USE_SCC */
+
+#ifdef STARPU_USE_MIC
+/*
+ * Shared body of the MIC copy methods of the block interface.
+ *
+ * copy_func is called once per (y, z) row with nx*elemsize bytes.  Its return
+ * value is not examined: this function always returns 0.  The geometry
+ * (nx, ny, nz, elemsize) is read from the destination interface; each side
+ * uses its own ldy/ldz strides.
+ */
+static int copy_mic_common(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node,
+						   int (*copy_func)(void *, unsigned, void *, unsigned, size_t))
+{
+	struct starpu_block_interface *src_block = src_interface;
+	struct starpu_block_interface *dst_block = dst_interface;
+
+	uint32_t nx = dst_block->nx;
+	uint32_t ny = dst_block->ny;
+	uint32_t nz = dst_block->nz;
+	size_t elemsize = dst_block->elemsize;
+	size_t row_size = nx * elemsize;
+
+	/* Strides are widened to size_t so that the row offsets below are not
+	 * computed (and truncated) in 32 bits. */
+	size_t ldy_src = src_block->ldy;
+	size_t ldz_src = src_block->ldz;
+	size_t ldy_dst = dst_block->ldy;
+	size_t ldz_dst = dst_block->ldz;
+
+	uintptr_t ptr_src = src_block->ptr;
+	uintptr_t ptr_dst = dst_block->ptr;
+
+	unsigned y, z;
+	for (z = 0; z < nz; z++)
+	{
+		for (y = 0; y < ny; y++)
+		{
+			size_t src_offset = (y*ldy_src + z*ldz_src)*elemsize;
+			size_t dst_offset = (y*ldy_dst + z*ldz_dst)*elemsize;
+
+			copy_func((void *)(ptr_src + src_offset), src_node, (void *)(ptr_dst + dst_offset), dst_node, row_size);
+		}
+	}
+
+	/* Total size is accumulated in size_t to avoid a 32-bit overflow. */
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, row_size*ny*nz);
+
+	return 0;
+}
+/* Block copy method: runs copy_mic_common() with _starpu_mic_copy_ram_to_mic
+ * as the per-row transfer routine and returns its result. */
+static int copy_ram_to_mic(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	int ret = copy_mic_common(src_interface, src_node,
+				  dst_interface, dst_node,
+				  _starpu_mic_copy_ram_to_mic);
+
+	return ret;
+}
+
+/* Block copy method: runs copy_mic_common() with _starpu_mic_copy_mic_to_ram
+ * as the per-row transfer routine and returns its result. */
+static int copy_mic_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	int ret = copy_mic_common(src_interface, src_node,
+				  dst_interface, dst_node,
+				  _starpu_mic_copy_mic_to_ram);
+
+	return ret;
+}
+
+/* Block copy method: issues every row through copy_mic_common() with
+ * _starpu_mic_copy_ram_to_mic_async, then reports -EAGAIN. */
+static int copy_ram_to_mic_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	/* The result of copy_mic_common() is deliberately not used. */
+	(void) copy_mic_common(src_interface, src_node,
+			       dst_interface, dst_node,
+			       _starpu_mic_copy_ram_to_mic_async);
+
+	return -EAGAIN;
+}
+
+/* Block copy method: issues every row through copy_mic_common() with
+ * _starpu_mic_copy_mic_to_ram_async, then reports -EAGAIN. */
+static int copy_mic_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	/* The result of copy_mic_common() is deliberately not used. */
+	(void) copy_mic_common(src_interface, src_node,
+			       dst_interface, dst_node,
+			       _starpu_mic_copy_mic_to_ram_async);
+
+	return -EAGAIN;
+}
+#endif
+
 /* as not all platform easily have a BLAS lib installed ... */
 /* as not all platform easily have a BLAS lib installed ... */
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 {

+ 4 - 2
src/datawizard/interfaces/coo_interface.c

@@ -89,6 +89,7 @@ register_coo_handle(starpu_data_handle_t handle, unsigned home_node,
 			local_interface->rows = 0;
 			local_interface->rows = 0;
 		}
 		}
 
 
+		local_interface->id = coo_interface->id;
 		local_interface->nx = coo_interface->nx;
 		local_interface->nx = coo_interface->nx;
 		local_interface->ny = coo_interface->ny;
 		local_interface->ny = coo_interface->ny;
 		local_interface->n_values = coo_interface->n_values;
 		local_interface->n_values = coo_interface->n_values;
@@ -189,7 +190,7 @@ display_coo_interface(starpu_data_handle_t handle, FILE *f)
 	fprintf(f, "%u\t%u", coo_interface->nx, coo_interface->ny);
 	fprintf(f, "%u\t%u", coo_interface->nx, coo_interface->ny);
 }
 }
 
 
-struct starpu_data_interface_ops _starpu_interface_coo_ops =
+struct starpu_data_interface_ops starpu_interface_coo_ops =
 {
 {
 	.register_data_handle  = register_coo_handle,
 	.register_data_handle  = register_coo_handle,
 	.allocate_data_on_node = allocate_coo_buffer_on_node,
 	.allocate_data_on_node = allocate_coo_buffer_on_node,
@@ -212,6 +213,7 @@ starpu_coo_data_register(starpu_data_handle_t *handleptr, unsigned home_node,
 {
 {
 	struct starpu_coo_interface coo_interface =
 	struct starpu_coo_interface coo_interface =
 	{
 	{
+		.id = STARPU_COO_INTERFACE_ID,
 		.values = values,
 		.values = values,
 		.columns = columns,
 		.columns = columns,
 		.rows = rows,
 		.rows = rows,
@@ -222,5 +224,5 @@ starpu_coo_data_register(starpu_data_handle_t *handleptr, unsigned home_node,
 	};
 	};
 
 
 	starpu_data_register(handleptr, home_node, &coo_interface,
 	starpu_data_register(handleptr, home_node, &coo_interface,
-			     &_starpu_interface_coo_ops);
+			     &starpu_interface_coo_ops);
 }
 }

+ 1 - 0
src/datawizard/interfaces/csr_filters.c

@@ -46,6 +46,7 @@ void starpu_csr_filter_vertical_block(void *father_interface, void *child_interf
 
 
 	uint32_t local_nnz = rowptr[first_index + child_nrow] - rowptr[first_index];
 	uint32_t local_nnz = rowptr[first_index + child_nrow] - rowptr[first_index];
 
 
+	csr_child->id = csr_father->id;
 	csr_child->nnz = local_nnz;
 	csr_child->nnz = local_nnz;
 	csr_child->nrow = child_nrow;
 	csr_child->nrow = child_nrow;
 	csr_child->firstentry = local_firstentry;
 	csr_child->firstentry = local_firstentry;

+ 6 - 2
src/datawizard/interfaces/csr_interface.c

@@ -27,6 +27,8 @@
 #include <starpu_cuda.h>
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
+#include <drivers/scc/driver_scc_source.h>
+#include <drivers/mic/driver_mic_source.h>
 
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 
@@ -42,7 +44,7 @@ static size_t csr_interface_get_size(starpu_data_handle_t handle);
 static int csr_compare(void *data_interface_a, void *data_interface_b);
 static int csr_compare(void *data_interface_a, void *data_interface_b);
 static uint32_t footprint_csr_interface_crc32(starpu_data_handle_t handle);
 static uint32_t footprint_csr_interface_crc32(starpu_data_handle_t handle);
 
 
-static struct starpu_data_interface_ops interface_csr_ops =
+struct starpu_data_interface_ops starpu_interface_csr_ops =
 {
 {
 	.register_data_handle = register_csr_handle,
 	.register_data_handle = register_csr_handle,
 	.allocate_data_on_node = allocate_csr_buffer_on_node,
 	.allocate_data_on_node = allocate_csr_buffer_on_node,
@@ -76,6 +78,7 @@ static void register_csr_handle(starpu_data_handle_t handle, unsigned home_node,
 			local_interface->colind = NULL;
 			local_interface->colind = NULL;
 		}
 		}
 
 
+		local_interface->id = csr_interface->id;
 		local_interface->rowptr = csr_interface->rowptr;
 		local_interface->rowptr = csr_interface->rowptr;
 		local_interface->nnz = csr_interface->nnz;
 		local_interface->nnz = csr_interface->nnz;
 		local_interface->nrow = csr_interface->nrow;
 		local_interface->nrow = csr_interface->nrow;
@@ -91,6 +94,7 @@ void starpu_csr_data_register(starpu_data_handle_t *handleptr, unsigned home_nod
 {
 {
 	struct starpu_csr_interface csr_interface =
 	struct starpu_csr_interface csr_interface =
 	{
 	{
+		.id = STARPU_CSR_INTERFACE_ID,
 		.nnz = nnz,
 		.nnz = nnz,
 		.nrow = nrow,
 		.nrow = nrow,
 		.nzval = nzval,
 		.nzval = nzval,
@@ -100,7 +104,7 @@ void starpu_csr_data_register(starpu_data_handle_t *handleptr, unsigned home_nod
 		.elemsize = elemsize
 		.elemsize = elemsize
 	};
 	};
 
 
-	starpu_data_register(handleptr, home_node, &csr_interface, &interface_csr_ops);
+	starpu_data_register(handleptr, home_node, &csr_interface, &starpu_interface_csr_ops);
 }
 }
 
 
 static uint32_t footprint_csr_interface_crc32(starpu_data_handle_t handle)
 static uint32_t footprint_csr_interface_crc32(starpu_data_handle_t handle)

+ 39 - 0
src/datawizard/interfaces/data_interface.c

@@ -83,6 +83,40 @@ void _starpu_data_interface_shutdown()
 	registered_tag_handles = NULL;
 	registered_tag_handles = NULL;
 }
 }
 
 
+/*
+ * Return the operations table of the built-in data interface identified by
+ * interface_id.  Any identifier without an entry below triggers STARPU_ABORT().
+ *
+ * NOTE(review): there is no entry for STARPU_COO_INTERFACE_ID although
+ * starpu_interface_coo_ops is a non-static symbol -- confirm this is intended.
+ */
+struct starpu_data_interface_ops *_starpu_data_interface_get_ops(unsigned interface_id)
+{
+	struct starpu_data_interface_ops *ops = NULL;
+
+	switch (interface_id)
+	{
+		case STARPU_MATRIX_INTERFACE_ID:
+			ops = &starpu_interface_matrix_ops;
+			break;
+		case STARPU_BLOCK_INTERFACE_ID:
+			ops = &starpu_interface_block_ops;
+			break;
+		case STARPU_VECTOR_INTERFACE_ID:
+			ops = &starpu_interface_vector_ops;
+			break;
+		case STARPU_CSR_INTERFACE_ID:
+			ops = &starpu_interface_csr_ops;
+			break;
+		case STARPU_BCSR_INTERFACE_ID:
+			ops = &starpu_interface_bcsr_ops;
+			break;
+		case STARPU_VARIABLE_INTERFACE_ID:
+			ops = &starpu_interface_variable_ops;
+			break;
+		case STARPU_VOID_INTERFACE_ID:
+			ops = &starpu_interface_void_ops;
+			break;
+		case STARPU_MULTIFORMAT_INTERFACE_ID:
+			ops = &starpu_interface_multiformat_ops;
+			break;
+		default:
+			/* Unknown identifier: ops stays NULL if the abort ever returns. */
+			STARPU_ABORT();
+			break;
+	}
+
+	return ops;
+}
+
 /* Register the mapping from PTR to HANDLE.  If PTR is already mapped to
 /* Register the mapping from PTR to HANDLE.  If PTR is already mapped to
  * some handle, the new mapping shadows the previous one.   */
  * some handle, the new mapping shadows the previous one.   */
 void _starpu_data_register_ram_pointer(starpu_data_handle_t handle, void *ptr)
 void _starpu_data_register_ram_pointer(starpu_data_handle_t handle, void *ptr)
@@ -598,6 +632,11 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 					break;
 					break;
 				}
 				}
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+				case STARPU_MIC_RAM:
+					cl = mf_ops->mic_to_cpu_cl;
+					break;
+#endif
 				case STARPU_CPU_RAM:      /* Impossible ! */
 				case STARPU_CPU_RAM:      /* Impossible ! */
 				default:
 				default:
 					STARPU_ABORT();
 					STARPU_ABORT();

+ 10 - 0
src/datawizard/interfaces/data_interface.h

@@ -23,6 +23,14 @@
 
 
 /* Some data interfaces or filters use this interface internally */
 /* Some data interfaces or filters use this interface internally */
 extern struct starpu_data_interface_ops starpu_interface_matrix_ops;
 extern struct starpu_data_interface_ops starpu_interface_matrix_ops;
+extern struct starpu_data_interface_ops starpu_interface_block_ops;
+extern struct starpu_data_interface_ops starpu_interface_vector_ops;
+extern struct starpu_data_interface_ops starpu_interface_csr_ops;
+extern struct starpu_data_interface_ops starpu_interface_bcsr_ops;
+extern struct starpu_data_interface_ops starpu_interface_variable_ops;
+extern struct starpu_data_interface_ops starpu_interface_void_ops;
+extern struct starpu_data_interface_ops starpu_interface_multiformat_ops;
+
 void _starpu_data_free_interfaces(starpu_data_handle_t handle)
 void _starpu_data_free_interfaces(starpu_data_handle_t handle)
 	STARPU_ATTRIBUTE_INTERNAL;
 	STARPU_ATTRIBUTE_INTERNAL;
 
 
@@ -33,6 +41,8 @@ extern void _starpu_data_interface_init(void) STARPU_ATTRIBUTE_INTERNAL;
 extern int _starpu_data_check_not_busy(starpu_data_handle_t handle) STARPU_ATTRIBUTE_INTERNAL;
 extern int _starpu_data_check_not_busy(starpu_data_handle_t handle) STARPU_ATTRIBUTE_INTERNAL;
 extern void _starpu_data_interface_shutdown(void) STARPU_ATTRIBUTE_INTERNAL;
 extern void _starpu_data_interface_shutdown(void) STARPU_ATTRIBUTE_INTERNAL;
 
 
+struct starpu_data_interface_ops *_starpu_data_interface_get_ops(unsigned interface_id);
+
 extern void _starpu_data_register_ram_pointer(starpu_data_handle_t handle,
 extern void _starpu_data_register_ram_pointer(starpu_data_handle_t handle,
 						void *ptr)
 						void *ptr)
 	STARPU_ATTRIBUTE_INTERNAL;
 	STARPU_ATTRIBUTE_INTERNAL;

+ 2 - 0
src/datawizard/interfaces/matrix_filters.c

@@ -41,6 +41,7 @@ void starpu_matrix_filter_block(void *father_interface, void *child_interface, S
 						     &child_nx, &offset);
 						     &child_nx, &offset);
 
 
 	/* update the child's interface */
 	/* update the child's interface */
+	matrix_child->id = matrix_father->id;
 	matrix_child->nx = child_nx;
 	matrix_child->nx = child_nx;
 	matrix_child->ny = ny;
 	matrix_child->ny = ny;
 	matrix_child->elemsize = elemsize;
 	matrix_child->elemsize = elemsize;
@@ -115,6 +116,7 @@ void starpu_matrix_filter_vertical_block(void *father_interface, void *child_int
 						     matrix_father->ld,
 						     matrix_father->ld,
 						     &child_ny, &offset);
 						     &child_ny, &offset);
 
 
+	matrix_child->id = matrix_father->id;
 	matrix_child->nx = nx;
 	matrix_child->nx = nx;
 	matrix_child->ny = child_ny;
 	matrix_child->ny = child_ny;
 	matrix_child->elemsize = elemsize;
 	matrix_child->elemsize = elemsize;

+ 161 - 0
src/datawizard/interfaces/matrix_interface.c

@@ -24,6 +24,8 @@
 #include <starpu_cuda.h>
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
+#include <drivers/scc/driver_scc_source.h>
+#include <drivers/mic/driver_mic_source.h>
 
 
 /* If you can promise that there is no stride in your matrices, you can define this */
 /* If you can promise that there is no stride in your matrices, you can define this */
 // #define NO_STRIDE
 // #define NO_STRIDE
@@ -47,6 +49,17 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
 static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
 static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
 #endif
 #endif
+#ifdef STARPU_USE_SCC
+static int copy_scc_src_to_sink(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_scc_sink_to_src(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_scc_sink_to_sink(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+#endif
+#ifdef STARPU_USE_MIC
+static int copy_ram_to_mic(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_mic_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_ram_to_mic_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_mic_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+#endif
 
 
 static const struct starpu_data_copy_methods matrix_copy_data_methods_s =
 static const struct starpu_data_copy_methods matrix_copy_data_methods_s =
 {
 {
@@ -127,6 +140,7 @@ static void register_matrix_handle(starpu_data_handle_t handle, unsigned home_no
 			local_interface->ld  = 0;
 			local_interface->ld  = 0;
 		}
 		}
 
 
+		local_interface->id = matrix_interface->id;
 		local_interface->nx = matrix_interface->nx;
 		local_interface->nx = matrix_interface->nx;
 		local_interface->ny = matrix_interface->ny;
 		local_interface->ny = matrix_interface->ny;
 		local_interface->elemsize = matrix_interface->elemsize;
 		local_interface->elemsize = matrix_interface->elemsize;
@@ -151,6 +165,7 @@ void starpu_matrix_data_register(starpu_data_handle_t *handleptr, unsigned home_
 {
 {
 	struct starpu_matrix_interface matrix_interface =
 	struct starpu_matrix_interface matrix_interface =
 	{
 	{
+		.id = STARPU_MATRIX_INTERFACE_ID,
 		.ptr = ptr,
 		.ptr = ptr,
 		.ld = ld,
 		.ld = ld,
 		.nx = nx,
 		.nx = nx,
@@ -160,6 +175,11 @@ void starpu_matrix_data_register(starpu_data_handle_t *handleptr, unsigned home_
                 .offset = 0
                 .offset = 0
 	};
 	};
 
 
+#ifdef STARPU_USE_SCC
+	_starpu_scc_set_offset_in_shared_memory((void*)matrix_interface.ptr,
+			(void**)&(matrix_interface.dev_handle), &(matrix_interface.offset));
+#endif
+
 	starpu_data_register(handleptr, home_node, &matrix_interface, &starpu_interface_matrix_ops);
 	starpu_data_register(handleptr, home_node, &matrix_interface, &starpu_interface_matrix_ops);
 }
 }
 
 
@@ -558,6 +578,147 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_A
 
 
 #endif
 #endif
 
 
+#ifdef STARPU_USE_SCC
+/*
+ * Copy a matrix from src_node to dst_node with _starpu_scc_copy_src_to_sink(),
+ * one row of nx elements at a time.
+ *
+ * The geometry (nx, ny, elemsize) is read from the destination interface;
+ * each side uses its own leading dimension.  Always returns 0.
+ */
+static int copy_scc_src_to_sink(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	uint32_t nx = STARPU_MATRIX_GET_NX(dst_interface);
+	uint32_t ny = STARPU_MATRIX_GET_NY(dst_interface);
+
+	size_t elemsize = STARPU_MATRIX_GET_ELEMSIZE(dst_interface);
+	size_t row_size = nx * elemsize;
+
+	/* Leading dimensions are widened to size_t so that the row offsets
+	 * below are not computed (and truncated) in 32 bits. */
+	size_t src_ld = STARPU_MATRIX_GET_LD(src_interface);
+	size_t dst_ld = STARPU_MATRIX_GET_LD(dst_interface);
+
+	/* char * rather than void *: pointer arithmetic on void * is a GNU extension. */
+	char *src_ptr = (char *)STARPU_MATRIX_GET_PTR(src_interface);
+	char *dst_ptr = (char *)STARPU_MATRIX_GET_PTR(dst_interface);
+
+	unsigned y;
+	for (y = 0; y < ny; ++y)
+	{
+		size_t src_offset = y*src_ld*elemsize;
+		size_t dst_offset = y*dst_ld*elemsize;
+
+		_starpu_scc_copy_src_to_sink((void *)(src_ptr + src_offset), src_node,
+						(void *)(dst_ptr + dst_offset), dst_node, row_size);
+	}
+
+	/* Total size is accumulated in size_t to avoid a 32-bit overflow. */
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, row_size*ny);
+
+	return 0;
+}
+
+/*
+ * Copy a matrix from src_node to dst_node with _starpu_scc_copy_sink_to_src(),
+ * one row of nx elements at a time.
+ *
+ * The geometry (nx, ny, elemsize) is read from the destination interface;
+ * each side uses its own leading dimension.  Always returns 0.
+ */
+static int copy_scc_sink_to_src(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	uint32_t nx = STARPU_MATRIX_GET_NX(dst_interface);
+	uint32_t ny = STARPU_MATRIX_GET_NY(dst_interface);
+
+	size_t elemsize = STARPU_MATRIX_GET_ELEMSIZE(dst_interface);
+	size_t row_size = nx * elemsize;
+
+	/* Leading dimensions are widened to size_t so that the row offsets
+	 * below are not computed (and truncated) in 32 bits. */
+	size_t src_ld = STARPU_MATRIX_GET_LD(src_interface);
+	size_t dst_ld = STARPU_MATRIX_GET_LD(dst_interface);
+
+	/* char * rather than void *: pointer arithmetic on void * is a GNU extension. */
+	char *src_ptr = (char *)STARPU_MATRIX_GET_PTR(src_interface);
+	char *dst_ptr = (char *)STARPU_MATRIX_GET_PTR(dst_interface);
+
+	unsigned y;
+	for (y = 0; y < ny; ++y)
+	{
+		size_t src_offset = y*src_ld*elemsize;
+		size_t dst_offset = y*dst_ld*elemsize;
+
+		_starpu_scc_copy_sink_to_src((void *)(src_ptr + src_offset), src_node,
+						(void *)(dst_ptr + dst_offset), dst_node, row_size);
+	}
+
+	/* Total size is accumulated in size_t to avoid a 32-bit overflow. */
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, row_size*ny);
+
+	return 0;
+}
+
+/*
+ * Copy a matrix from src_node to dst_node with _starpu_scc_copy_sink_to_sink(),
+ * one row of nx elements at a time.
+ *
+ * The geometry (nx, ny, elemsize) is read from the destination interface;
+ * each side uses its own leading dimension.  Always returns 0.
+ */
+static int copy_scc_sink_to_sink(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	uint32_t nx = STARPU_MATRIX_GET_NX(dst_interface);
+	uint32_t ny = STARPU_MATRIX_GET_NY(dst_interface);
+
+	size_t elemsize = STARPU_MATRIX_GET_ELEMSIZE(dst_interface);
+	size_t row_size = nx * elemsize;
+
+	/* Leading dimensions are widened to size_t so that the row offsets
+	 * below are not computed (and truncated) in 32 bits. */
+	size_t src_ld = STARPU_MATRIX_GET_LD(src_interface);
+	size_t dst_ld = STARPU_MATRIX_GET_LD(dst_interface);
+
+	/* char * rather than void *: pointer arithmetic on void * is a GNU extension. */
+	char *src_ptr = (char *)STARPU_MATRIX_GET_PTR(src_interface);
+	char *dst_ptr = (char *)STARPU_MATRIX_GET_PTR(dst_interface);
+
+	unsigned y;
+	for (y = 0; y < ny; ++y)
+	{
+		size_t src_offset = y*src_ld*elemsize;
+		size_t dst_offset = y*dst_ld*elemsize;
+
+		_starpu_scc_copy_sink_to_sink((void *)(src_ptr + src_offset), src_node,
+						(void *)(dst_ptr + dst_offset), dst_node, row_size);
+	}
+
+	/* Total size is accumulated in size_t to avoid a 32-bit overflow. */
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, row_size*ny);
+
+	return 0;
+}
+#endif /* STARPU_USE_SCC */
+
+#ifdef STARPU_USE_MIC
+/*
+ * Shared body of the MIC copy methods of the matrix interface.
+ *
+ * copy_func is called once per row with nx*elemsize bytes.  Its return value
+ * is not examined: this function always returns 0.  The geometry
+ * (nx, ny, elemsize) is read from the destination interface; each side uses
+ * its own leading dimension.
+ */
+static int copy_mic_common(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node,
+						   int (*copy_func)(void *, unsigned, void *, unsigned, size_t))
+{
+	struct starpu_matrix_interface *src_matrix = src_interface;
+	struct starpu_matrix_interface *dst_matrix = dst_interface;
+
+	unsigned y;
+	uint32_t nx = dst_matrix->nx;
+	uint32_t ny = dst_matrix->ny;
+	size_t elemsize = dst_matrix->elemsize;
+	size_t row_size = nx * elemsize;
+
+	/* Leading dimensions are widened to size_t so that the row offsets
+	 * below are not computed (and truncated) in 32 bits. */
+	size_t ld_src = src_matrix->ld;
+	size_t ld_dst = dst_matrix->ld;
+
+	uintptr_t ptr_src = src_matrix->ptr;
+	uintptr_t ptr_dst = dst_matrix->ptr;
+
+	for (y = 0; y < ny; y++)
+	{
+		size_t src_offset = y*ld_src*elemsize;
+		size_t dst_offset = y*ld_dst*elemsize;
+
+		copy_func((void *)(ptr_src + src_offset), src_node, (void *)(ptr_dst + dst_offset), dst_node, row_size);
+	}
+
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, row_size*ny);
+
+	return 0;
+}
+
+/* Matrix copy method: runs copy_mic_common() with _starpu_mic_copy_ram_to_mic
+ * as the per-row transfer routine and returns its result. */
+static int copy_ram_to_mic(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	int ret = copy_mic_common(src_interface, src_node,
+				  dst_interface, dst_node,
+				  _starpu_mic_copy_ram_to_mic);
+
+	return ret;
+}
+
+/* Matrix copy method: runs copy_mic_common() with _starpu_mic_copy_mic_to_ram
+ * as the per-row transfer routine and returns its result. */
+static int copy_mic_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	int ret = copy_mic_common(src_interface, src_node,
+				  dst_interface, dst_node,
+				  _starpu_mic_copy_mic_to_ram);
+
+	return ret;
+}
+
+/* Matrix copy method: issues every row through copy_mic_common() with
+ * _starpu_mic_copy_ram_to_mic_async, then reports -EAGAIN. */
+static int copy_ram_to_mic_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	/* The result of copy_mic_common() is deliberately not used. */
+	(void) copy_mic_common(src_interface, src_node,
+			       dst_interface, dst_node,
+			       _starpu_mic_copy_ram_to_mic_async);
+
+	return -EAGAIN;
+}
+
+/* Matrix copy method: issues every row through copy_mic_common() with
+ * _starpu_mic_copy_mic_to_ram_async, then reports -EAGAIN. */
+static int copy_mic_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	/* The result of copy_mic_common() is deliberately not used. */
+	(void) copy_mic_common(src_interface, src_node,
+			       dst_interface, dst_node,
+			       _starpu_mic_copy_mic_to_ram_async);
+
+	return -EAGAIN;
+}
+#endif
+
 /* as not all platform easily have a  lib installed ... */
 /* as not all platform easily have a  lib installed ... */
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 {

+ 105 - 2
src/datawizard/interfaces/multiformat_interface.c

@@ -23,6 +23,7 @@
 #include <starpu_cuda.h>
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
+#include <drivers/mic/driver_mic_source.h>
 #include <core/task.h>
 #include <core/task.h>
 
 
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
@@ -41,6 +42,12 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_A
 static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cl_event *event);
 static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cl_event *event);
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cl_event *event);
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cl_event *event);
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+static int copy_ram_to_mic(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_mic_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_ram_to_mic_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_mic_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+#endif
 
 
 static const struct starpu_data_copy_methods multiformat_copy_data_methods_s =
 static const struct starpu_data_copy_methods multiformat_copy_data_methods_s =
 {
 {
@@ -65,6 +72,12 @@ static const struct starpu_data_copy_methods multiformat_copy_data_methods_s =
         .ram_to_opencl_async = copy_ram_to_opencl_async,
         .ram_to_opencl_async = copy_ram_to_opencl_async,
 	.opencl_to_ram_async = copy_opencl_to_ram_async,
 	.opencl_to_ram_async = copy_opencl_to_ram_async,
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+	.ram_to_mic = copy_ram_to_mic,
+	.mic_to_ram = copy_mic_to_ram,
+	.ram_to_mic_async = copy_ram_to_mic_async,
+	.mic_to_ram_async = copy_mic_to_ram_async,
+#endif
 };
 };
 
 
 static void register_multiformat_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
 static void register_multiformat_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
@@ -86,7 +99,7 @@ get_mf_ops(void *data_interface)
 	return mf->ops;
 	return mf->ops;
 }
 }
 
 
-static struct starpu_data_interface_ops interface_multiformat_ops =
+struct starpu_data_interface_ops starpu_interface_multiformat_ops =
 {
 {
 	.register_data_handle  = register_multiformat_handle,
 	.register_data_handle  = register_multiformat_handle,
 	.allocate_data_on_node = allocate_multiformat_buffer_on_node,
 	.allocate_data_on_node = allocate_multiformat_buffer_on_node,
@@ -121,6 +134,10 @@ static void *multiformat_handle_to_pointer(starpu_data_handle_t handle, unsigned
 		case STARPU_OPENCL_RAM:
 		case STARPU_OPENCL_RAM:
 			return multiformat_interface->opencl_ptr;
 			return multiformat_interface->opencl_ptr;
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+		case STARPU_MIC_RAM:
+			return multiformat_interface->mic_ptr;
+#endif
 		default:
 		default:
 			STARPU_ABORT();
 			STARPU_ABORT();
 	}
 	}
@@ -147,6 +164,9 @@ static void register_multiformat_handle(starpu_data_handle_t handle, unsigned ho
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 			local_interface->opencl_ptr = multiformat_interface->opencl_ptr;
 			local_interface->opencl_ptr = multiformat_interface->opencl_ptr;
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+			local_interface->mic_ptr    = multiformat_interface->mic_ptr;
+#endif
 		}
 		}
 		else
 		else
 		{
 		{
@@ -157,7 +177,11 @@ static void register_multiformat_handle(starpu_data_handle_t handle, unsigned ho
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 			local_interface->opencl_ptr = NULL;
 			local_interface->opencl_ptr = NULL;
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+			local_interface->mic_ptr    = NULL;
+#endif
 		}
 		}
+		local_interface->id = multiformat_interface->id;
 		local_interface->nx = multiformat_interface->nx;
 		local_interface->nx = multiformat_interface->nx;
 		local_interface->ops = multiformat_interface->ops;
 		local_interface->ops = multiformat_interface->ops;
 	}
 	}
@@ -173,17 +197,23 @@ void starpu_multiformat_data_register(starpu_data_handle_t *handleptr,
 	_starpu_codelet_check_deprecated_fields(format_ops->opencl_to_cpu_cl);
 	_starpu_codelet_check_deprecated_fields(format_ops->opencl_to_cpu_cl);
 	_starpu_codelet_check_deprecated_fields(format_ops->cpu_to_cuda_cl);
 	_starpu_codelet_check_deprecated_fields(format_ops->cpu_to_cuda_cl);
 	_starpu_codelet_check_deprecated_fields(format_ops->cuda_to_cpu_cl);
 	_starpu_codelet_check_deprecated_fields(format_ops->cuda_to_cpu_cl);
+	_starpu_codelet_check_deprecated_fields(format_ops->cpu_to_mic_cl);
+	_starpu_codelet_check_deprecated_fields(format_ops->mic_to_cpu_cl);
+	_starpu_codelet_check_deprecated_fields(format_ops->cpu_to_scc_cl);
+	_starpu_codelet_check_deprecated_fields(format_ops->scc_to_cpu_cl);
 
 
 	struct starpu_multiformat_interface multiformat =
 	struct starpu_multiformat_interface multiformat =
 	{
 	{
+		.id         = STARPU_MULTIFORMAT_INTERFACE_ID,
 		.cpu_ptr    = ptr,
 		.cpu_ptr    = ptr,
 		.cuda_ptr   = NULL,
 		.cuda_ptr   = NULL,
 		.opencl_ptr = NULL,
 		.opencl_ptr = NULL,
+		.mic_ptr    = NULL,
 		.nx         = nobjects,
 		.nx         = nobjects,
 		.ops        = format_ops
 		.ops        = format_ops
 	};
 	};
 
 
-	starpu_data_register(handleptr, home_node, &multiformat, &interface_multiformat_ops);
+	starpu_data_register(handleptr, home_node, &multiformat, &starpu_interface_multiformat_ops);
 }
 }
 
 
 static uint32_t footprint_multiformat_interface_crc32(starpu_data_handle_t handle)
 static uint32_t footprint_multiformat_interface_crc32(starpu_data_handle_t handle)
@@ -204,6 +234,12 @@ static int multiformat_compare(void *data_interface_a, void *data_interface_b)
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 			&& (multiformat_a->ops->opencl_elemsize == multiformat_b->ops->opencl_elemsize)
 			&& (multiformat_a->ops->opencl_elemsize == multiformat_b->ops->opencl_elemsize)
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+		    && (multiformat_a->ops->mic_elemsize == multiformat_b->ops->mic_elemsize)
+#endif
+#ifdef STARPU_USE_SCC
+		    && (multiformat_a->ops->scc_elemsize == multiformat_b->ops->scc_elemsize)
+#endif
 		);
 		);
 }
 }
 
 
@@ -645,3 +681,70 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node,
 	return 0;
 	return 0;
 }
 }
 #endif
 #endif
+
+#ifdef STARPU_USE_MIC
+/* Shared body of the RAM -> MIC copy methods of the multiformat interface.
+ *
+ * The transfer size is nx * mic_elemsize, both read from the destination
+ * interface, and the transfer itself is delegated to COPY_FUNC.
+ *
+ * Returns 0, or -ENOMEM when the mic_ptr buffer of the source interface
+ * cannot be allocated.
+ */
+static int copy_mic_common_ram_to_mic(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node,
+						   int (*copy_func)(void *, unsigned, void *, unsigned, size_t))
+{
+	struct starpu_multiformat_interface *src_multiformat = src_interface;
+	struct starpu_multiformat_interface *dst_multiformat = dst_interface;
+
+	STARPU_ASSERT(src_multiformat != NULL);
+	STARPU_ASSERT(dst_multiformat != NULL);
+	STARPU_ASSERT(dst_multiformat->ops != NULL);
+
+	size_t size = dst_multiformat->nx * dst_multiformat->ops->mic_elemsize;
+	/* Allocate the source-side mic_ptr buffer with malloc() the first time
+	 * this interface is transferred. */
+	if (src_multiformat->mic_ptr == NULL)
+	{
+		src_multiformat->mic_ptr = malloc(size);
+		if (src_multiformat->mic_ptr == NULL)
+			return -ENOMEM;
+	}
+	
+	/* NOTE(review): cpu_ptr is passed on both sides and the mic_ptr buffer
+	 * allocated above is not handed to copy_func -- confirm this is the
+	 * intended layout. The value returned by copy_func is discarded. */
+	copy_func(src_multiformat->cpu_ptr, src_node, dst_multiformat->cpu_ptr, dst_node, size);
+
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, size);
+
+	return 0;
+}
+
+/* Shared body of the MIC -> RAM copy methods of the multiformat interface.
+ *
+ * The transfer size is nx * mic_elemsize, both read from the SOURCE
+ * interface, and the transfer of mic_ptr to mic_ptr is delegated to
+ * COPY_FUNC. Always returns 0.
+ *
+ * NOTE(review): the value returned by copy_func is discarded.
+ */
+static int copy_mic_common_mic_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node,
+						   int (*copy_func)(void *, unsigned, void *, unsigned, size_t))
+{
+	struct starpu_multiformat_interface *src_multiformat = src_interface;
+	struct starpu_multiformat_interface *dst_multiformat = dst_interface;
+
+	STARPU_ASSERT(src_multiformat != NULL);
+	STARPU_ASSERT(dst_multiformat != NULL);
+	/* The size below is computed from the source ops, so that is the
+	 * pointer which must be checked before it is dereferenced. */
+	STARPU_ASSERT(src_multiformat->ops != NULL);
+	STARPU_ASSERT(dst_multiformat->ops != NULL);
+
+	size_t size = src_multiformat->nx * src_multiformat->ops->mic_elemsize;
+	copy_func(src_multiformat->mic_ptr, src_node, dst_multiformat->mic_ptr, dst_node, size);
+
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, size);
+
+	return 0;
+}
+
+/* RAM -> MIC copy method: runs the shared helper with
+ * _starpu_mic_copy_ram_to_mic and hands its status back unchanged. */
+static int copy_ram_to_mic(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	int status = copy_mic_common_ram_to_mic(src_interface, src_node,
+						dst_interface, dst_node,
+						_starpu_mic_copy_ram_to_mic);
+
+	return status;
+}
+
+/* MIC -> RAM copy method: runs the shared helper with
+ * _starpu_mic_copy_mic_to_ram and hands its status back unchanged. */
+static int copy_mic_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	int status = copy_mic_common_mic_to_ram(src_interface, src_node,
+						dst_interface, dst_node,
+						_starpu_mic_copy_mic_to_ram);
+
+	return status;
+}
+
+/* Asynchronous RAM -> MIC copy method.
+ *
+ * Runs the shared helper with _starpu_mic_copy_ram_to_mic_async. Returns
+ * -EAGAIN once the helper has succeeded, or the helper's own error code
+ * (-ENOMEM) when it could not even set the transfer up.
+ */
+static int copy_ram_to_mic_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	int ret = copy_mic_common_ram_to_mic(src_interface, src_node, dst_interface, dst_node, _starpu_mic_copy_ram_to_mic_async);
+
+	/* Do not report -EAGAIN for a transfer that was never started:
+	 * the helper bails out before calling the copy function when its
+	 * allocation fails. */
+	if (ret != 0)
+		return ret;
+
+	return -EAGAIN;
+}
+
+/* Asynchronous MIC -> RAM copy method: runs the shared helper with
+ * _starpu_mic_copy_mic_to_ram_async, ignores its status and always
+ * returns -EAGAIN. */
+static int copy_mic_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	(void) copy_mic_common_mic_to_ram(src_interface, src_node,
+					  dst_interface, dst_node,
+					  _starpu_mic_copy_mic_to_ram_async);
+
+	return -EAGAIN;
+}
+#endif

+ 20 - 4
src/datawizard/interfaces/variable_interface.c

@@ -24,6 +24,8 @@
 #include <starpu_cuda.h>
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
+#include <drivers/scc/driver_scc_source.h>
+#include <drivers/mic/driver_mic_source.h>
 
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 
@@ -41,7 +43,7 @@ static uint32_t footprint_variable_interface_crc32(starpu_data_handle_t handle);
 static int variable_compare(void *data_interface_a, void *data_interface_b);
 static int variable_compare(void *data_interface_a, void *data_interface_b);
 static void display_variable_interface(starpu_data_handle_t handle, FILE *f);
 static void display_variable_interface(starpu_data_handle_t handle, FILE *f);
 
 
-static struct starpu_data_interface_ops interface_variable_ops =
+struct starpu_data_interface_ops starpu_interface_variable_ops =
 {
 {
 	.register_data_handle = register_variable_handle,
 	.register_data_handle = register_variable_handle,
 	.allocate_data_on_node = allocate_variable_buffer_on_node,
 	.allocate_data_on_node = allocate_variable_buffer_on_node,
@@ -65,6 +67,7 @@ static void *variable_handle_to_pointer(starpu_data_handle_t handle, unsigned no
 
 
 static void register_variable_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
 static void register_variable_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
 {
 {
+	struct starpu_variable_interface *variable_interface = (struct starpu_variable_interface *)data_interface;
 	unsigned node;
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
 	{
@@ -73,14 +76,19 @@ static void register_variable_handle(starpu_data_handle_t handle, unsigned home_
 
 
 		if (node == home_node)
 		if (node == home_node)
 		{
 		{
-			local_interface->ptr = STARPU_VARIABLE_GET_PTR(data_interface);
+			local_interface->ptr = variable_interface->ptr;
+			local_interface->dev_handle = variable_interface->dev_handle;
+			local_interface->offset = variable_interface->offset;
 		}
 		}
 		else
 		else
 		{
 		{
 			local_interface->ptr = 0;
 			local_interface->ptr = 0;
+			local_interface->dev_handle = 0;
+			local_interface->offset = 0;
 		}
 		}
 
 
-		local_interface->elemsize = STARPU_VARIABLE_GET_ELEMSIZE(data_interface);
+		local_interface->id = variable_interface->id;
+		local_interface->elemsize = variable_interface->elemsize;
 	}
 	}
 }
 }
 
 
@@ -90,11 +98,19 @@ void starpu_variable_data_register(starpu_data_handle_t *handleptr, unsigned hom
 {
 {
 	struct starpu_variable_interface variable =
 	struct starpu_variable_interface variable =
 	{
 	{
+		.id = STARPU_VARIABLE_INTERFACE_ID,
 		.ptr = ptr,
 		.ptr = ptr,
+		.dev_handle = ptr,
+		.offset = 0,
 		.elemsize = elemsize
 		.elemsize = elemsize
 	};
 	};
 
 
-	starpu_data_register(handleptr, home_node, &variable, &interface_variable_ops);
+#ifdef STARPU_USE_SCC
+	_starpu_scc_set_offset_in_shared_memory((void*)variable.ptr, (void**)&(variable.dev_handle),
+			&(variable.offset));
+#endif
+
+	starpu_data_register(handleptr, home_node, &variable, &starpu_interface_variable_ops);
 }
 }
 
 
 
 

+ 4 - 0
src/datawizard/interfaces/vector_filters.c

@@ -35,6 +35,7 @@ void starpu_vector_filter_block(void *father_interface, void *child_interface, S
 	_starpu_filter_nparts_compute_chunk_size_and_offset(nx, nchunks, elemsize, id, 1,
 	_starpu_filter_nparts_compute_chunk_size_and_offset(nx, nchunks, elemsize, id, 1,
 						     &child_nx, &offset);
 						     &child_nx, &offset);
 
 
+	vector_child->id = vector_father->id;
 	vector_child->nx = child_nx;
 	vector_child->nx = child_nx;
 	vector_child->elemsize = elemsize;
 	vector_child->elemsize = elemsize;
 
 
@@ -95,6 +96,8 @@ void starpu_vector_filter_divide_in_2(void *father_interface, void *child_interf
 
 
 	STARPU_ASSERT_MSG(length_first < nx, "First part is too long: %u vs %u", length_first, nx);
 	STARPU_ASSERT_MSG(length_first < nx, "First part is too long: %u vs %u", length_first, nx);
 
 
+	vector_child->id = vector_father->id;
+
 	/* this is the first child */
 	/* this is the first child */
 	if (id == 0)
 	if (id == 0)
 	{
 	{
@@ -138,6 +141,7 @@ void starpu_vector_filter_list(void *father_interface, void *child_interface, st
 
 
 	uint32_t chunk_size = length_tab[id];
 	uint32_t chunk_size = length_tab[id];
 
 
+	vector_child->id = vector_father->id;
 	vector_child->nx = chunk_size;
 	vector_child->nx = chunk_size;
 	vector_child->elemsize = elemsize;
 	vector_child->elemsize = elemsize;
 
 

+ 10 - 2
src/datawizard/interfaces/vector_interface.c

@@ -24,6 +24,8 @@
 #include <starpu_cuda.h>
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
+#include <drivers/mic/driver_mic_source.h>
+#include <drivers/scc/driver_scc_source.h>
 
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 
@@ -41,7 +43,7 @@ static uint32_t footprint_vector_interface_crc32(starpu_data_handle_t handle);
 static int vector_compare(void *data_interface_a, void *data_interface_b);
 static int vector_compare(void *data_interface_a, void *data_interface_b);
 static void display_vector_interface(starpu_data_handle_t handle, FILE *f);
 static void display_vector_interface(starpu_data_handle_t handle, FILE *f);
 
 
-static struct starpu_data_interface_ops interface_vector_ops =
+struct starpu_data_interface_ops starpu_interface_vector_ops =
 {
 {
 	.register_data_handle = register_vector_handle,
 	.register_data_handle = register_vector_handle,
 	.allocate_data_on_node = allocate_vector_buffer_on_node,
 	.allocate_data_on_node = allocate_vector_buffer_on_node,
@@ -89,6 +91,7 @@ static void register_vector_handle(starpu_data_handle_t handle, unsigned home_no
                         local_interface->offset = 0;
                         local_interface->offset = 0;
 		}
 		}
 
 
+		local_interface->id = vector_interface->id;
 		local_interface->nx = vector_interface->nx;
 		local_interface->nx = vector_interface->nx;
 		local_interface->elemsize = vector_interface->elemsize;
 		local_interface->elemsize = vector_interface->elemsize;
 	}
 	}
@@ -100,6 +103,7 @@ void starpu_vector_data_register(starpu_data_handle_t *handleptr, unsigned home_
 {
 {
 	struct starpu_vector_interface vector =
 	struct starpu_vector_interface vector =
 	{
 	{
+		.id = STARPU_VECTOR_INTERFACE_ID,
 		.ptr = ptr,
 		.ptr = ptr,
 		.nx = nx,
 		.nx = nx,
 		.elemsize = elemsize,
 		.elemsize = elemsize,
@@ -107,7 +111,11 @@ void starpu_vector_data_register(starpu_data_handle_t *handleptr, unsigned home_
                 .offset = 0
                 .offset = 0
 	};
 	};
 
 
-	starpu_data_register(handleptr, home_node, &vector, &interface_vector_ops);
+#ifdef STARPU_USE_SCC
+	_starpu_scc_set_offset_in_shared_memory((void*)vector.ptr, (void**)&(vector.dev_handle), &(vector.offset));
+#endif
+
+	starpu_data_register(handleptr, home_node, &vector, &starpu_interface_vector_ops);
 }
 }
 
 
 
 

+ 2 - 2
src/datawizard/interfaces/void_interface.c

@@ -40,7 +40,7 @@ static uint32_t footprint_void_interface_crc32(starpu_data_handle_t handle);
 static int void_compare(void *data_interface_a, void *data_interface_b);
 static int void_compare(void *data_interface_a, void *data_interface_b);
 static void display_void_interface(starpu_data_handle_t handle, FILE *f);
 static void display_void_interface(starpu_data_handle_t handle, FILE *f);
 
 
-static struct starpu_data_interface_ops interface_void_ops =
+struct starpu_data_interface_ops starpu_interface_void_ops =
 {
 {
 	.register_data_handle = register_void_handle,
 	.register_data_handle = register_void_handle,
 	.allocate_data_on_node = allocate_void_buffer_on_node,
 	.allocate_data_on_node = allocate_void_buffer_on_node,
@@ -64,7 +64,7 @@ static void register_void_handle(starpu_data_handle_t handle STARPU_ATTRIBUTE_UN
 /* declare a new data with the void interface */
 /* declare a new data with the void interface */
 void starpu_void_data_register(starpu_data_handle_t *handleptr)
 void starpu_void_data_register(starpu_data_handle_t *handleptr)
 {
 {
-	starpu_data_register(handleptr, 0, NULL, &interface_void_ops);
+	starpu_data_register(handleptr, 0, NULL, &starpu_interface_void_ops);
 }
 }
 
 
 
 

+ 35 - 0
src/datawizard/malloc.c

@@ -177,6 +177,13 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 	}
 	}
 #endif /* STARPU_SIMGRID */
 #endif /* STARPU_SIMGRID */
 
 
+	if (_starpu_can_submit_scc_task())
+	{
+#ifdef STARPU_USE_SCC
+		_starpu_scc_allocate_shared_memory(A, dim);
+#endif
+	}
+	else
 #ifdef STARPU_HAVE_POSIX_MEMALIGN
 #ifdef STARPU_HAVE_POSIX_MEMALIGN
 	if (_malloc_align != sizeof(void*))
 	if (_malloc_align != sizeof(void*))
 	{
 	{
@@ -318,6 +325,12 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 	}
 	}
 #endif /* STARPU_SIMGRID */
 #endif /* STARPU_SIMGRID */
 
 
+	if (_starpu_can_submit_scc_task())
+	{
+#ifdef STARPU_USE_SCC
+		_starpu_scc_free_shared_memory(A, dim);
+#endif
+	} else
 	free(A);
 	free(A);
 
 
 out:
 out:
@@ -406,6 +419,18 @@ starpu_malloc_on_node(unsigned dst_node, size_t size)
 #endif
 #endif
 			}
 			}
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+		case STARPU_MIC_RAM:
+			if (_starpu_mic_allocate_memory((void **)(&addr), size, dst_node))
+				addr = 0;
+			break;
+#endif
+#ifdef STARPU_USE_SCC
+		case STARPU_SCC_RAM:
+			if (_starpu_scc_allocate_memory((void **)(&addr), size, dst_node))
+				addr = 0;
+			break;
+#endif
 		default:
 		default:
 			STARPU_ABORT();
 			STARPU_ABORT();
 	}
 	}
@@ -461,6 +486,16 @@ starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
                         break;
                         break;
 		}
 		}
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+		case STARPU_MIC_RAM:
+			_starpu_mic_free_memory((void*) addr, size, dst_node);
+			break;
+#endif
+#ifdef STARPU_USE_SCC
+		case STARPU_SCC_RAM:
+			_starpu_scc_free_memory((void *) addr, size, dst_node);
+			break;
+#endif
 		default:
 		default:
 			STARPU_ABORT();
 			STARPU_ABORT();
 	}
 	}

+ 24 - 1
src/datawizard/reduction.c

@@ -20,6 +20,8 @@
 #include <util/starpu_data_cpy.h>
 #include <util/starpu_data_cpy.h>
 #include <core/task.h>
 #include <core/task.h>
 #include <datawizard/datawizard.h>
 #include <datawizard/datawizard.h>
+#include <drivers/mic/driver_mic_source.h>
+#include <drivers/mp_common/source_common.h>
 
 
 void starpu_data_set_reduction_methods(starpu_data_handle_t handle,
 void starpu_data_set_reduction_methods(starpu_data_handle_t handle,
 				       struct starpu_codelet *redux_cl,
 				       struct starpu_codelet *redux_cl,
@@ -68,6 +70,12 @@ void _starpu_redux_init_data_replicate(starpu_data_handle_t handle, struct _star
 		case STARPU_OPENCL_WORKER:
 		case STARPU_OPENCL_WORKER:
 			init_func = _starpu_task_get_opencl_nth_implementation(init_cl, 0);
 			init_func = _starpu_task_get_opencl_nth_implementation(init_cl, 0);
 			break;
 			break;
+#ifdef STARPU_USE_MIC
+		case STARPU_MIC_WORKER:
+			init_func = _starpu_mic_src_get_kernel_from_codelet(init_cl, 0);
+			break;
+#endif
+			/* TODO: SCC */
 		default:
 		default:
 			STARPU_ABORT();
 			STARPU_ABORT();
 			break;
 			break;
@@ -75,7 +83,22 @@ void _starpu_redux_init_data_replicate(starpu_data_handle_t handle, struct _star
 
 
 	STARPU_ASSERT(init_func);
 	STARPU_ASSERT(init_func);
 
 
-	init_func(&replicate->data_interface, NULL);
+#ifdef STARPU_USE_MIC
+	if (starpu_worker_get_type(workerid) == STARPU_MIC_WORKER)
+	{
+		const struct _starpu_mp_node *node = _starpu_mic_src_get_actual_thread_mp_node();
+
+		// XXX: give the correct coreid.
+		_starpu_src_common_execute_kernel(node,
+						  (void(*)(void))init_func, 0,
+						  &handle, &(replicate->data_interface), 1,
+						  NULL, 0);
+	}
+	else
+#endif
+	{
+		init_func(&replicate->data_interface, NULL);
+	}
 
 
 	replicate->initialized = 1;
 	replicate->initialized = 1;
 }
 }

+ 19 - 0
src/debug/traces/starpu_fxt.c

@@ -31,12 +31,16 @@
 static char *cpus_worker_colors[STARPU_NMAXWORKERS] = {"/greens9/7", "/greens9/6", "/greens9/5", "/greens9/4",  "/greens9/9", "/greens9/3",  "/greens9/2",  "/greens9/1"  };
 static char *cpus_worker_colors[STARPU_NMAXWORKERS] = {"/greens9/7", "/greens9/6", "/greens9/5", "/greens9/4",  "/greens9/9", "/greens9/3",  "/greens9/2",  "/greens9/1"  };
 static char *cuda_worker_colors[STARPU_NMAXWORKERS] = {"/ylorrd9/9", "/ylorrd9/6", "/ylorrd9/3", "/ylorrd9/1", "/ylorrd9/8", "/ylorrd9/7", "/ylorrd9/4", "/ylorrd9/2",  "/ylorrd9/1"};
 static char *cuda_worker_colors[STARPU_NMAXWORKERS] = {"/ylorrd9/9", "/ylorrd9/6", "/ylorrd9/3", "/ylorrd9/1", "/ylorrd9/8", "/ylorrd9/7", "/ylorrd9/4", "/ylorrd9/2",  "/ylorrd9/1"};
 static char *opencl_worker_colors[STARPU_NMAXWORKERS] = {"/blues9/9", "/blues9/6", "/blues9/3", "/blues9/1", "/blues9/8", "/blues9/7", "/blues9/4", "/blues9/2",  "/blues9/1"};
 static char *opencl_worker_colors[STARPU_NMAXWORKERS] = {"/blues9/9", "/blues9/6", "/blues9/3", "/blues9/1", "/blues9/8", "/blues9/7", "/blues9/4", "/blues9/2",  "/blues9/1"};
+static char *mic_worker_colors[STARPU_NMAXWORKERS] = {"/reds9/9", "/reds9/6", "/reds9/3", "/reds9/1", "/reds9/8", "/reds9/7", "/reds9/4", "/reds9/2",  "/reds9/1"};
+static char *scc_worker_colors[STARPU_NMAXWORKERS] = {"/reds9/9", "/reds9/6", "/reds9/3", "/reds9/1", "/reds9/8", "/reds9/7", "/reds9/4", "/reds9/2",  "/reds9/1"};
 static char *other_worker_colors[STARPU_NMAXWORKERS] = {"/greys9/9", "/greys9/8", "/greys9/7", "/greys9/6"};
 static char *other_worker_colors[STARPU_NMAXWORKERS] = {"/greys9/9", "/greys9/8", "/greys9/7", "/greys9/6"};
 static char *worker_colors[STARPU_NMAXWORKERS];
 static char *worker_colors[STARPU_NMAXWORKERS];
 
 
 static unsigned opencl_index = 0;
 static unsigned opencl_index = 0;
 static unsigned cuda_index = 0;
 static unsigned cuda_index = 0;
 static unsigned cpus_index = 0;
 static unsigned cpus_index = 0;
+static unsigned mic_index = 0;
+static unsigned scc_index = 0;
 static unsigned other_index = 0;
 static unsigned other_index = 0;
 
 
 static void set_next_other_worker_color(int workerid)
 static void set_next_other_worker_color(int workerid)
@@ -59,6 +63,11 @@ static void set_next_opencl_worker_color(int workerid)
 	worker_colors[workerid] = opencl_worker_colors[opencl_index++];
 	worker_colors[workerid] = opencl_worker_colors[opencl_index++];
 }
 }
 
 
+/* Assign the next unused entry of mic_worker_colors to worker WORKERID. */
+static void set_next_mic_worker_color(int workerid)
+{
+	worker_colors[workerid] = mic_worker_colors[mic_index++];
+}
+
+/* Assign the next unused entry of scc_worker_colors to worker WORKERID.
+ * This commit calls it from the _STARPU_FUT_SCC_KEY case of
+ * handle_worker_init_start() but never defined it, which breaks the build. */
+static void set_next_scc_worker_color(int workerid)
+{
+	worker_colors[workerid] = scc_worker_colors[scc_index++];
+}
+
 static const char *get_worker_color(int workerid)
 static const char *get_worker_color(int workerid)
 {
 {
 	return worker_colors[workerid];
 	return worker_colors[workerid];
@@ -345,6 +354,16 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 			kindstr = "OPENCL";
 			kindstr = "OPENCL";
 			archtype = STARPU_OPENCL_DEFAULT + devid;
 			archtype = STARPU_OPENCL_DEFAULT + devid;
 			break;
 			break;
+		case _STARPU_FUT_MIC_KEY:
+			set_next_mic_worker_color(workerid);
+			kindstr = "mic";
+			archtype = STARPU_MIC_DEFAULT + devid;
+			break;
+		case _STARPU_FUT_SCC_KEY:
+			set_next_scc_worker_color(workerid);
+			kindstr = "scc";
+			archtype = STARPU_SCC_DEFAULT + devid;
+			break;
 		default:
 		default:
 			STARPU_ABORT();
 			STARPU_ABORT();
 	}
 	}

+ 2 - 1
src/drivers/gordon/driver_gordon.c

@@ -374,7 +374,8 @@ void *gordon_worker_inject(struct _starpu_worker_set *arg)
 					struct _starpu_job_list *chunk_list;
 					struct _starpu_job_list *chunk_list;
 					if (chunk != (nchunks -1))
 					if (chunk != (nchunks -1))
 					{
 					{
-						/* split the list in 2 parts : list = chunk_list | tail */
+						/* split the list in 2 parts :
+						 * list = chunk_list | tail */
 						chunk_list = _starpu_job_list_new();
 						chunk_list = _starpu_job_list_new();
 
 
 						/* find the end */
 						/* find the end */

+ 116 - 0
src/drivers/mic/driver_mic_common.c

@@ -0,0 +1,116 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <starpu.h>
+#include <drivers/mp_common/mp_common.h>
+#include <drivers/mic/driver_mic_common.h>
+
+
+/* Print a human-readable description of the error code STATUS, together
+ * with the location (FUNC, FILE, LINE) that reported it, then abort through
+ * STARPU_ASSERT(0).
+ */
+void _starpu_mic_common_report_scif_error(const char *func, const char *file, const int line, const int status)
+{
+	const char *errormsg = strerror(status);
+	/* LINE is an int, hence %d. The message goes to stderr so that it is
+	 * not left in a stdio buffer when the assertion below aborts. */
+	fprintf(stderr, "Common: oops in %s (%s:%d)... %d: %s \n", func, file, line, status, errormsg);
+	STARPU_ASSERT(0);
+}
+
+/* Send LEN bytes from MSG over the main connection endpoint of NODE.
+ * A negative result from scif_send() is reported here through
+ * STARPU_MP_COMMON_REPORT_ERROR, so the (generic) caller has no error to
+ * handle.
+ */
+
+void _starpu_mic_common_send(const struct _starpu_mp_node *node, void *msg, int len)
+{
+	int sent = scif_send(node->mp_connection.mic_endpoint, msg, len, SCIF_SEND_BLOCK);
+
+	if (sent < 0)
+		STARPU_MP_COMMON_REPORT_ERROR(node, errno);
+}
+
+/* Receive LEN bytes into MSG from the main connection endpoint of NODE.
+ * A negative result from scif_recv() is reported here through
+ * STARPU_MP_COMMON_REPORT_ERROR, so the (generic) caller has no error to
+ * handle.
+ */
+
+void _starpu_mic_common_recv(const struct _starpu_mp_node *node, void *msg, int len)
+{
+	int received = scif_recv(node->mp_connection.mic_endpoint, msg, len, SCIF_RECV_BLOCK);
+
+	if (received < 0)
+		STARPU_MP_COMMON_REPORT_ERROR(node, errno);
+}
+
+/* Send LEN bytes from MSG over the host/sink data-transfer endpoint of
+ * MP_NODE. A negative result from scif_send() is reported here through
+ * STARPU_MP_COMMON_REPORT_ERROR, so the (generic) caller has no error to
+ * handle.
+ */
+void _starpu_mic_common_dt_send(const struct _starpu_mp_node *mp_node, void *msg, int len)
+{
+	int sent = scif_send(mp_node->host_sink_dt_connection.mic_endpoint, msg, len, SCIF_SEND_BLOCK);
+
+	if (sent < 0)
+		STARPU_MP_COMMON_REPORT_ERROR(mp_node, errno);
+}
+
+/* Receive LEN bytes into MSG from the host/sink data-transfer endpoint of
+ * MP_NODE. A negative result from scif_recv() is reported here through
+ * STARPU_MP_COMMON_REPORT_ERROR, so the (generic) caller has no error to
+ * handle.
+ */
+void _starpu_mic_common_dt_recv(const struct _starpu_mp_node *mp_node, void *msg, int len)
+{
+	/* scif_recv() takes the receive flag, as _starpu_mic_common_recv()
+	 * does; the send flag was passed here by mistake. */
+	if ((scif_recv(mp_node->host_sink_dt_connection.mic_endpoint, msg, len, SCIF_RECV_BLOCK)) < 0)
+		STARPU_MP_COMMON_REPORT_ERROR(mp_node, errno);
+}
+
+/* Open a SCIF endpoint, bind it to LOCAL_PORT_NUMBER and connect it to port
+ * REMOTE_PORT_NUMBER of node REMOTE_NODE. The endpoint is stored in ENDPOINT.
+ * A refused connection is retried; any other failure is reported through
+ * STARPU_MIC_COMMON_REPORT_SCIF_ERROR.
+ */
+void _starpu_mic_common_connect(scif_epd_t *endpoint, uint16_t remote_node,
+				uint16_t local_port_number, uint16_t remote_port_number)
+{
+	/* Address (node, port) of the peer to connect to */
+	struct scif_portID portID;
+
+	portID.node = remote_node;
+	portID.port = remote_port_number;
+
+	if ((*endpoint = scif_open()) < 0)
+		STARPU_MIC_COMMON_REPORT_SCIF_ERROR(errno);
+
+	if ((scif_bind(*endpoint, local_port_number)) < 0)
+		STARPU_MIC_COMMON_REPORT_SCIF_ERROR(errno);
+
+	/* scif_connect() returns the local port number on success, not 0, so
+	 * only a negative result is a failure. Testing "!= 0" made a
+	 * successful connection look like an error. */
+	while (scif_connect(*endpoint, &portID) < 0)
+	{
+		/* ECONNREFUSED: presumably the peer is not listening yet, try
+		 * again. Anything else is fatal. */
+		if (errno != ECONNREFUSED)
+			STARPU_MIC_COMMON_REPORT_SCIF_ERROR(errno);
+	}
+}
+
+/* Listen on port PORT_NUMBER, wait for one incoming connection and accept
+ * it; the resulting connected endpoint is stored in ENDPOINT. Every SCIF
+ * failure is reported through STARPU_MIC_COMMON_REPORT_SCIF_ERROR. */
+void _starpu_mic_common_accept(scif_epd_t *endpoint, uint16_t port_number)
+{
+	/* Filled in by scif_accept() and otherwise unused: it is only passed
+	 * so that scif_accept() has a valid PEER argument to write to. */
+	struct scif_portID peer;
+
+	/* Listening endpoint, needed only while the connection is set up */
+	int listen_epd = scif_open();
+
+	if (listen_epd < 0)
+		STARPU_MIC_COMMON_REPORT_SCIF_ERROR(errno);
+
+	if (scif_bind(listen_epd, port_number) < 0)
+		STARPU_MIC_COMMON_REPORT_SCIF_ERROR(errno);
+
+	/* A backlog of 1 is enough: exactly one connection is expected,
+	 * more would be an error. */
+	if (scif_listen(listen_epd, 1) < 0)
+		STARPU_MIC_COMMON_REPORT_SCIF_ERROR(errno);
+
+	if (scif_accept(listen_epd, &peer, endpoint, SCIF_ACCEPT_SYNC) < 0)
+		STARPU_MIC_COMMON_REPORT_SCIF_ERROR(errno);
+
+	scif_close(listen_epd);
+}

+ 69 - 0
src/drivers/mic/driver_mic_common.h

@@ -0,0 +1,69 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#ifndef __DRIVER_MIC_COMMON_H__
+#define __DRIVER_MIC_COMMON_H__
+
+
+#include <common/config.h>
+
+
+#ifdef STARPU_USE_MIC
+
+/* Map a StarPU MIC device index to a MIC id (index + 1).
+ * NOTE(review): presumably id 0 designates the host -- confirm. */
+#define STARPU_TO_MIC_ID(id) ((id) + 1)
+
+/* First port of the range used by the MIC driver; all the port macros
+ * below are offsets from it. */
+#define STARPU_MIC_PORTS_BEGIN 1099
+
+/* Ports of the main connection: one fixed "source" port, and one port per
+ * device index for the sinks. */
+#define STARPU_MIC_SOURCE_PORT_NUMBER STARPU_MIC_PORTS_BEGIN
+#define STARPU_MIC_SINK_PORT_NUMBER(id) ((id) + STARPU_MIC_PORTS_BEGIN + 1)
+
+/* Ports of the host/sink data-transfer (DT) connection, laid out after the
+ * STARPU_MAXMICDEVS sink ports above. */
+#define STARPU_MIC_SOURCE_DT_PORT_NUMBER (STARPU_MAXMICDEVS + STARPU_MIC_PORTS_BEGIN + 1)
+#define STARPU_MIC_SINK_DT_PORT_NUMBER(id) ((id) + STARPU_MAXMICDEVS + STARPU_MIC_PORTS_BEGIN + 2)
+
+/* Port of the sink-to-sink data-transfer connection between devices ME and
+ * PEER_ID: one port per ordered (me, peer_id) pair, laid out after the
+ * host/sink DT ports. */
+#define STARPU_MIC_SINK_SINK_DT_PORT_NUMBER(me, peer_id) \
+((me) * STARPU_MAXMICDEVS + (peer_id) +  2 * STARPU_MAXMICDEVS + STARPU_MIC_PORTS_BEGIN + 2)
+
+/* Page size assumed by the driver (4 KiB), and SIZE rounded up to the next
+ * multiple of it. The rounding macro evaluates SIZE several times, so the
+ * argument must not have side effects. */
+#define STARPU_MIC_PAGE_SIZE 0x1000
+#define STARPU_MIC_GET_PAGE_SIZE_MULTIPLE(size) \
+(((size) % STARPU_MIC_PAGE_SIZE == 0) ? (size) : (((size) / STARPU_MIC_PAGE_SIZE + 1) * STARPU_MIC_PAGE_SIZE))
+
+/* Report error code STATUS together with the current function, file and
+ * line. */
+#define STARPU_MIC_COMMON_REPORT_SCIF_ERROR(status) \
+	_starpu_mic_common_report_scif_error(__starpu_func__, __FILE__, __LINE__, status)
+
+/* Argument of a "free" request, as decoded by _starpu_mic_sink_free(). */
+struct _starpu_mic_free_command
+{
+	/* Address of the buffer to release */
+	void *addr;
+	/* Size of that buffer in bytes; the sink uses it to compute the
+	 * window to unregister when STARPU_MIC_USE_RMA is defined */
+	size_t size;
+};
+
+/* Print the message associated with error code STATUS and the reporting
+ * location, then abort. */
+void _starpu_mic_common_report_scif_error(const char *func, const char *file, int line, const int status);
+
+/* Blocking send of LEN bytes of MSG on the main connection of NODE. */
+void _starpu_mic_common_send(const struct _starpu_mp_node *node, void *msg, int len);
+
+/* Blocking receive of LEN bytes into MSG on the main connection of NODE. */
+void _starpu_mic_common_recv(const struct _starpu_mp_node *node, void *msg, int len);
+
+/* Blocking send of LEN bytes of MSG on the host/sink data-transfer
+ * connection of NODE. */
+void _starpu_mic_common_dt_send(const struct _starpu_mp_node *node, void *msg, int len);
+
+/* Blocking receive of LEN bytes into MSG on the host/sink data-transfer
+ * connection of NODE. */
+void _starpu_mic_common_dt_recv(const struct _starpu_mp_node *node, void *msg, int len);
+
+/* Connect to (REMOTE_NODE, REMOTE_PORT_NUMBER) from LOCAL_PORT_NUMBER; the
+ * endpoint is stored in ENDPOINT. */
+void _starpu_mic_common_connect(scif_epd_t *endpoint, uint16_t remote_node, 
+				uint16_t local_port_number, uint16_t remote_port_number);
+/* Wait for and accept one connection on PORT_NUMBER; the endpoint is stored
+ * in ENDPOINT. */
+void _starpu_mic_common_accept(scif_epd_t *endpoint, uint16_t port_number);
+
+#endif /* STARPU_USE_MIC */
+
+#endif /* __DRIVER_MIC_COMMON_H__ */

+ 135 - 0
src/drivers/mic/driver_mic_sink.c

@@ -0,0 +1,135 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <errno.h>
+
+#include <common/COISysInfo_common.h>
+
+#include <starpu.h>
+#include <drivers/mp_common/mp_common.h>
+#include <drivers/mp_common/sink_common.h>
+
+#include "driver_mic_common.h"
+#include "driver_mic_sink.h"
+
+/* Initialize the MIC sink: accept the two connections coming from the
+ * source (the main connection, then the host/sink data-transfer one).
+ * The connections to the other devices are not implemented yet; the code
+ * meant to set them up is kept below, commented out.
+ */
+
+void _starpu_mic_sink_init(struct _starpu_mp_node *node)
+{
+	//unsigned int i;
+	
+	/* Initialize connection with the source */
+	_starpu_mic_common_accept(&node->mp_connection.mic_endpoint,
+					 STARPU_MIC_SOURCE_PORT_NUMBER);
+
+	/* Second connection, used for host/sink data transfers */
+	_starpu_mic_common_accept(&node->host_sink_dt_connection.mic_endpoint,
+									 STARPU_MIC_SOURCE_DT_PORT_NUMBER);
+
+	/* Disabled: sink-to-sink data-transfer connections. */
+	//node->sink_sink_dt_connections = malloc(node->nb_mp_sinks * sizeof(union _starpu_mp_connection));
+
+	//for (i = 0; i < (unsigned int)node->devid; ++i)
+	//	_starpu_mic_common_connect(&node->sink_sink_dt_connections[i].mic_endpoint,
+	//								STARPU_TO_MIC_ID(i),
+	//								STARPU_MIC_SINK_SINK_DT_PORT_NUMBER(node->devid, i),	
+	//								STARPU_MIC_SINK_SINK_DT_PORT_NUMBER(i, node->devid));
+
+	//for (i = node->devid + 1; i < node->nb_mp_sinks; ++i)
+	//	_starpu_mic_common_accept(&node->sink_sink_dt_connections[i].mic_endpoint,
+	//								STARPU_MIC_SINK_SINK_DT_PORT_NUMBER(node->devid, i));
+}
+
+/* Deinitialize the MIC sink: close the two connections opened by
+ * _starpu_mic_sink_init() (data-transfer endpoint first, then the main one).
+ * The teardown of the sink-to-sink connections is kept commented out, like
+ * their setup.
+ */
+
+void _starpu_mic_sink_deinit(struct _starpu_mp_node *node)
+{
+	//unsigned int i;
+
+	//for (i = 0; i < node->nb_mp_sinks; ++i)
+	//{
+	//	if (i != (unsigned int)node->devid)
+	//		scif_close(node->sink_sink_dt_connections[i].mic_endpoint);
+	//}
+
+	//free(node->sink_sink_dt_connections);
+
+	scif_close(node->host_sink_dt_connection.mic_endpoint);
+	scif_close(node->mp_connection.mic_endpoint);
+}
+
+/* Report an error which occurred when using a MIC device: print it in a
+ * human-readable form, with the location (FUNC, FILE, LINE) that reported
+ * it, then abort through STARPU_ASSERT(0).
+ */
+
+void _starpu_mic_sink_report_error(const char *func, const char *file, const int line, const int status)
+{
+	const char *errormsg = strerror(status);
+	/* LINE is an int, hence %d. The message goes to stderr so that it is
+	 * not left in a stdio buffer when the assertion below aborts. */
+	fprintf(stderr, "SINK: oops in %s (%s:%d)... %d: %s \n", func, file, line, status, errormsg);
+	STARPU_ASSERT(0);
+}
+
+/* Return the number of cores of the machine running this code (a MIC
+ * device or a Xeon processor), as given by COISysGetCoreCount().
+ */
+unsigned int _starpu_mic_sink_get_nb_core(void)
+{
+	unsigned int nb_core = (unsigned int) COISysGetCoreCount();
+
+	return nb_core;
+}
+
+/* Allocate memory on the MIC on behalf of the source.
+ *
+ * ARG points to the requested size (a size_t) and ARG_SIZE must be
+ * sizeof(size_t). The buffer is page-aligned and, when STARPU_MIC_USE_RMA
+ * is defined, registered for remote direct access.
+ *
+ * Exactly one message is sent back: STARPU_ANSWER_ALLOCATE with the address
+ * of the buffer on success, STARPU_ERROR_ALLOCATE on failure.
+ */
+void _starpu_mic_sink_allocate(const struct _starpu_mp_node *mp_node, void *arg, int arg_size)
+{
+	STARPU_ASSERT(arg_size == sizeof(size_t));
+
+	void *addr = NULL;
+	size_t size = *(size_t *)(arg);
+
+	if (posix_memalign(&addr, STARPU_MIC_PAGE_SIZE, size) != 0)
+	{
+		_starpu_mp_common_send_command(mp_node, STARPU_ERROR_ALLOCATE, NULL, 0);
+		/* Stop here: an answer must not follow the error message. */
+		return;
+	}
+
+#ifdef STARPU_MIC_USE_RMA
+	scif_epd_t epd = mp_node->host_sink_dt_connection.mic_endpoint;
+	/* NOTE(review): the registered window is SIZE rounded up to a page
+	 * multiple, while only SIZE bytes were allocated -- confirm that
+	 * registering past the end of the allocation is acceptable. */
+	size_t window_size = STARPU_MIC_GET_PAGE_SIZE_MULTIPLE(size);
+
+	if (scif_register(epd, addr, window_size, (off_t)addr, SCIF_PROT_READ | SCIF_PROT_WRITE, SCIF_MAP_FIXED) < 0)
+	{
+		free(addr);
+		_starpu_mp_common_send_command(mp_node, STARPU_ERROR_ALLOCATE, NULL, 0);
+		/* Stop here: ADDR has just been freed and must not be handed
+		 * to the source. */
+		return;
+	}
+#endif
+
+	_starpu_mp_common_send_command(mp_node, STARPU_ANSWER_ALLOCATE, &addr, sizeof(addr));
+}
+
+/* Release a buffer on behalf of the source.
+ * ARG points to a struct _starpu_mic_free_command and ARG_SIZE must be its
+ * size. When STARPU_MIC_USE_RMA is defined, the page-rounded window
+ * starting at the buffer address is unregistered before the buffer is
+ * freed. */
+void _starpu_mic_sink_free(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, void *arg, int arg_size)
+{
+	struct _starpu_mic_free_command *cmd = arg;
+
+	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mic_free_command));
+
+#ifdef STARPU_MIC_USE_RMA
+	scif_unregister(mp_node->host_sink_dt_connection.mic_endpoint,
+			(off_t)cmd->addr,
+			STARPU_MIC_GET_PAGE_SIZE_MULTIPLE(cmd->size));
+#endif
+	free(cmd->addr);
+}

+ 48 - 0
src/drivers/mic/driver_mic_sink.h

@@ -0,0 +1,48 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DRIVER_MIC_SINK_H__
+#define __DRIVER_MIC_SINK_H__
+
+#include <common/config.h>
+
+#ifdef STARPU_USE_MIC
+
+#include <scif.h>
+
+#include <drivers/mp_common/mp_common.h>
+#include <drivers/mp_common/sink_common.h>
+
+
+/* Report error code STATUS together with the current function, file and
+ * line. */
+#define STARPU_MIC_SINK_REPORT_ERROR(status) \
+	_starpu_mic_sink_report_error(__starpu_func__, __FILE__, __LINE__, status)
+
+
+/* Print the message associated with error code STATUS and the reporting
+ * location, then abort. */
+void _starpu_mic_sink_report_error(const char *func, const char *file, const int line, const int status);
+
+/* Accept the connections coming from the source. */
+void _starpu_mic_sink_init(struct _starpu_mp_node *node);
+
+/* Close the connections opened by _starpu_mic_sink_init(). */
+void _starpu_mic_sink_deinit(struct _starpu_mp_node *node);
+
+/* Number of cores of the machine running the sink. */
+unsigned int _starpu_mic_sink_get_nb_core(void);
+
+/* Handlers for the allocate and free requests; ARG is the payload of the
+ * request and ARG_SIZE its size in bytes. */
+void _starpu_mic_sink_allocate(const struct _starpu_mp_node *mp_node, void *arg, int arg_size);
+void _starpu_mic_sink_free(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, void *arg, int arg_size);
+
+#endif /* STARPU_USE_MIC */
+
+
+#endif /* __DRIVER_MIC_SINK_H__ */

+ 749 - 0
src/drivers/mic/driver_mic_source.c

@@ -0,0 +1,749 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdio.h>
+
+#include <scif.h>
+
+#include <starpu.h>
+#include <starpu_profiling.h>
+#include <core/sched_policy.h>
+
+#include <drivers/driver_common/driver_common.h>
+#include <drivers/mp_common/source_common.h>
+
+#include "driver_mic_common.h"
+#include "driver_mic_source.h"
+
+
+/* Array of structures containing all the information needed to exchange
+ * commands and data with the devices.  It is indexed by MIC node id (see the
+ * accessors below, which bound-check the index against STARPU_MAXMICDEVS). */
+struct _starpu_mp_node *mic_nodes[STARPU_MAXMICDEVS];
+
+/* COI engine handle of each device, read by _starpu_mic_get_engine_info().
+ * NOTE(review): in this file the only code filling this array is the
+ * commented-out _starpu_mic_src_init_context() -- confirm it is set
+ * elsewhere before the memory-size helpers are used. */
+static COIENGINE handles[STARPU_MAXMICDEVS];
+/* static COIPROCESS process[STARPU_MAXMICDEVS]; */
+
+/* Structure used by the host to store information about a kernel executable
+ * on a MIC device: its name, and its address on each device.
+ * A NULL entry in FUNC means the host still has to perform a lookup for that
+ * device; a non-NULL entry is the address already resolved on that device.
+ */
+struct _starpu_mic_kernel
+{
+	char *name;
+	starpu_mic_kernel_t func[STARPU_MAXMICDEVS];
+};
+
+/* Hash table used to store the struct _starpu_mic_kernel entries, keyed by
+ * kernel name.
+ */
+static struct _starpu_htbl kernels_htbl;
+
+/* Mutex protecting concurrent accesses to kernels_htbl.
+ */
+starpu_pthread_mutex_t htbl_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/* Number of MIC workers initialized, and the mutex protecting it.
+ * NOTE(review): both are only referenced from commented-out code in this
+ * file.
+ */
+unsigned int nb_mic_worker_init = 0;
+starpu_pthread_mutex_t nb_mic_worker_init_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+/* Returns the ID of the MIC device controlled by the caller.
+ * if the worker doesn't control a MIC device -ENODEV is returned
+ */
+
+//static int _starpu_mic_get_devid(void)
+//{
+//	struct _starpu_machine_config *config = _starpu_get_machine_config();
+//	int workerid = starpu_worker_get_id();
+//
+//	if (config->workers[workerid].arch != STARPU_MIC_WORKER)
+//		return -ENODEV;
+//
+//	return config->workers[workerid].devid;
+//}
+
+/* Return the mp_node of the MIC device driven by the calling thread.
+ * The calling thread must be a StarPU worker whose mp_nodeid is a valid index
+ * in mic_nodes; both conditions are asserted.
+ */
+const struct _starpu_mp_node *_starpu_mic_src_get_actual_thread_mp_node()
+{
+	int nodeid;
+	struct _starpu_worker *actual_worker;
+
+	actual_worker = _starpu_get_local_worker_key();
+	STARPU_ASSERT(actual_worker);
+
+	nodeid = actual_worker->mp_nodeid;
+	STARPU_ASSERT(nodeid >= 0 && nodeid < STARPU_MAXMICDEVS);
+
+	return mic_nodes[nodeid];
+}
+
+/* Return the mp_node of the MIC device backing MEMORY_NODE.
+ * The memory node is translated to a device id, which is asserted to be a
+ * valid index in mic_nodes.
+ */
+const struct _starpu_mp_node *_starpu_mic_src_get_mp_node_from_memory_node(int memory_node)
+{
+	int nodeid;
+
+	nodeid = _starpu_memory_node_to_devid(memory_node);
+	STARPU_ASSERT(nodeid >= 0 && nodeid < STARPU_MAXMICDEVS);
+
+	return mic_nodes[nodeid];
+}
+
+// Should be obsolete.
+/* static void _starpu_mic_src_init_context(int devid,
+ * 					 struct starpu_conf *user_conf)
+ * {
+ * 	COIRESULT res;
+ * 	char mic_sink_program_path[1024];
+ *
+ * 	char ***argv = _starpu_get_argv();
+ * 	const char *suffixes[] = {"-mic", "_mic", NULL};
+ *
+ * 	char devid_env[32];
+ * 	sprintf(devid_env, "DEVID=%d", devid);
+ *
+ * 	char nb_mic_env[32];
+ * 	sprintf(nb_mic_env, "NB_MIC=%d", starpu_mic_worker_get_count());
+ *
+ * 	/\* Environment variables to send to the Sink, it informs it what kind
+ * 	 * of node it is (architecture and type) as there is no way to discover
+ * 	 * it itself *\/
+ * 	const char *mic_sink_env[] = {"STARPU_SINK=STARPU_MIC", devid_env, nb_mic_env, NULL};
+ *
+ * 	/\* Let's get the helper program to run on the MIC device *\/
+ * 	int mic_file_found = _starpu_src_common_locate_file(mic_sink_program_path,
+ * 							getenv("STARPU_MIC_SINK_PROGRAM_NAME"),
+ * 							getenv("STARPU_MIC_SINK_PROGRAM_PATH"),
+ * 							(user_conf == NULL ? NULL : user_conf->mic_sink_program_path),
+ * 							(argv ? (*argv)[0] : NULL),
+ * 							suffixes);
+ *
+ * 	STARPU_ASSERT(mic_file_found == 0);
+ *
+ * 	/\* Let's get the handle which let us manage the remote MIC device *\/
+ * 	res = COIEngineGetHandle(COI_ISA_MIC, devid, &handles[devid]);
+ * 	if (STARPU_UNLIKELY(res != COI_SUCCESS))
+ * 		STARPU_MIC_SRC_REPORT_COI_ERROR(res);
+ *
+ * 	/\* We launch the helper on the MIC device, which will wait for us
+ * 	 * to give it work to do.
+ * 	 * As we will communicate further with the device throught scif we
+ * 	 * don't need to keep the process pointer *\/
+ * 	res = COIProcessCreateFromFile(handles[devid], mic_sink_program_path, 0, NULL, 0,
+ * 				       mic_sink_env, 1, NULL, 0, NULL,
+ * 				       &process[devid]);
+ * 	if (STARPU_UNLIKELY(res != COI_SUCCESS))
+ * 		STARPU_MIC_SRC_REPORT_COI_ERROR(res);
+ *
+ * 	/\* Let's create the node structure, we'll communicate with the peer
+ * 	 * through scif thanks to it *\/
+ * 	mic_nodes[devid] = _starpu_mp_common_node_create(STARPU_MIC_SOURCE,
+ * 							   devid);
+ *
+ *
+ * 	// XXX: this is not replicated in `_starpu_init_mic_node'.
+ * 	STARPU_PTHREAD_MUTEX_LOCK(&nb_mic_worker_init_mutex);
+ * 	++nb_mic_worker_init;
+ * 	STARPU_PTHREAD_MUTEX_UNLOCK(&nb_mic_worker_init_mutex);
+ * } */
+
+/* static void _starpu_mic_src_free_kernel(void *kernel)
+ * {
+ * 	struct _starpu_mic_kernel *k = kernel;
+ *
+ * 	free(k->name);
+ * 	free(kernel);
+ * } */
+
+/* static void _starpu_mic_src_deinit_context(int devid)
+ * {
+ * 	_starpu_mp_common_send_command(mic_nodes[devid], STARPU_EXIT, NULL, 0);
+ *
+ * 	COIProcessDestroy(process[devid], -1, 0, NULL, NULL);
+ *
+ * 	_starpu_mp_common_node_destroy(mic_nodes[devid]);
+ *
+ * 	STARPU_PTHREAD_MUTEX_LOCK(&nb_mic_worker_init_mutex);
+ * 	unsigned int tmp = --nb_mic_worker_init;
+ * 	STARPU_PTHREAD_MUTEX_UNLOCK(&nb_mic_worker_init_mutex);
+ *
+ * 	if (tmp == 0)
+ * 		_starpu_htbl_destroy(&kernels_htbl, _starpu_mic_src_free_kernel);
+ * } */
+
+/* Host-side epilogue of job J, which was executed on behalf of WORKER:
+ * close the job in the driver, feed the timing back (J->cl_start up to now),
+ * push the task output and run the job termination handler.
+ * Always returns 0.
+ */
+static int
+_starpu_mic_src_finalize_job (struct _starpu_job *j, struct _starpu_worker *worker)
+{
+    struct timespec codelet_end;
+    const int profiling = starpu_profiling_status_get();
+
+    /* codelet_end is filled in here and consumed by the feedback update. */
+    _starpu_driver_end_job(worker, j, worker->perf_arch, &codelet_end, 0, profiling);
+    _starpu_driver_update_job_feedback(j, worker, worker->perf_arch,
+				       &j->cl_start, &codelet_end, profiling);
+
+    /* An empty (zero) mask is passed when pushing the output. */
+    _starpu_push_task_output (j, 0);
+    _starpu_handle_job_termination(j);
+
+    return 0;
+}
+
+/* Receive one STARPU_EXECUTION_COMPLETED notification from the device driven
+ * by WORKERSET and finalize the corresponding job.
+ *
+ * The payload of the notification starts with the id (int) of the core which
+ * completed its task; this id indexes WORKERSET->workers.  The id comes from
+ * the device, so it is validated before being used as an array index, and the
+ * designated worker must actually have a task in flight.
+ *
+ * Always returns 0 (protocol violations abort through STARPU_ASSERT).
+ */
+static int
+_starpu_mic_src_process_completed_job (struct _starpu_worker_set *workerset)
+{
+    struct _starpu_mp_node *node = mic_nodes[workerset->workers[0].mp_nodeid];
+    enum _starpu_mp_command answer;
+    void *arg;
+    int arg_size;
+
+    answer = _starpu_mp_common_recv_command (node, &arg, &arg_size);
+    STARPU_ASSERT (answer == STARPU_EXECUTION_COMPLETED);
+
+    /* The payload must at least hold the core id. */
+    STARPU_ASSERT (arg != NULL && arg_size >= (int) sizeof (int));
+
+    int coreid = *(int *) arg;
+
+    /* Never index the worker array with an unchecked remote value. */
+    STARPU_ASSERT (coreid >= 0 && (unsigned) coreid < workerset->nworkers);
+
+    struct _starpu_worker *worker = &workerset->workers[coreid];
+    struct starpu_task *task = worker->current_task;
+
+    /* A completion for a core which runs nothing is a protocol error. */
+    STARPU_ASSERT (task != NULL);
+
+    struct _starpu_job *j = _starpu_get_job_associated_to_task (task);
+
+    _starpu_mic_src_finalize_job (j, worker);
+
+    /* The core is free again. */
+    worker->current_task = NULL;
+
+    return 0;
+}
+
+
+/* Start the execution of job J on the MIC core represented by worker ARGS.
+ *
+ * The input data of the task is fetched, the device kernel is resolved from
+ * the codelet, the job is marked as started (J->cl_start) and the execution
+ * request is sent to the device.  The function does not wait for the kernel
+ * to complete: completion is handled by
+ * _starpu_mic_src_process_completed_job().
+ *
+ * Returns 0 on success, -EAGAIN when the input data could not be fetched.
+ */
+static int _starpu_mic_src_execute_job(struct _starpu_job *j, struct _starpu_worker *args)
+{
+	int ret;
+	uint32_t mask = 0;
+
+	STARPU_ASSERT(j);
+	struct starpu_task *task = j->task;
+	STARPU_ASSERT(task);
+	struct starpu_codelet *cl = task->cl;
+	STARPU_ASSERT(cl);
+
+	int profiling = starpu_profiling_status_get();
+
+	ret = _starpu_fetch_task_input(j, mask);
+	if (ret != 0)
+	{
+		/* there was not enough memory, so the input of
+		 * the codelet cannot be fetched ... put the
+		 * codelet back, and try it later */
+		return -EAGAIN;
+	}
+
+	/* Asserts internally that a kernel was found. */
+	starpu_mic_kernel_t kernel = _starpu_mic_src_get_kernel_from_codelet(cl, j->nimpl);
+
+	_starpu_driver_start_job (args, j, &j->cl_start, 0, profiling);
+
+	_starpu_src_common_execute_kernel_from_task(mic_nodes[args->mp_nodeid],
+						    (void (*)(void)) kernel, args->devid, task);
+
+	return 0;
+}
+
+/* Register the kernel named FUNC_NAME and return its symbol through SYMBOL.
+ *
+ * If a kernel with the same name is already in the table, its existing entry
+ * is returned.  Otherwise a new entry is allocated, with no device address
+ * resolved yet: the per-device lookup is done lazily by
+ * _starpu_mic_src_get_kernel().
+ *
+ * The new entry is completely initialized (name copied, every device slot
+ * set to NULL) *before* it is published in the hash table, so that a
+ * concurrent registration of the same name can never get hold of a
+ * half-initialized entry.
+ *
+ * Returns 0 on success, -ENOMEM if an allocation or the insertion failed
+ * (in which case *SYMBOL is left untouched).
+ */
+int _starpu_mic_src_register_kernel(starpu_mic_func_symbol_t *symbol, const char *func_name)
+{
+	unsigned int func_name_size = (strlen(func_name) + 1) * sizeof(char);
+	unsigned int i;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&htbl_mutex);
+	struct _starpu_mic_kernel *kernel = _starpu_htbl_search(&kernels_htbl, func_name, func_name_size);
+
+	if (kernel != NULL)
+	{
+		STARPU_PTHREAD_MUTEX_UNLOCK(&htbl_mutex);
+		// Function already in the table.
+		*symbol = kernel;
+		return 0;
+	}
+
+	kernel = malloc(sizeof(*kernel));
+	if (kernel == NULL)
+	{
+		STARPU_PTHREAD_MUTEX_UNLOCK(&htbl_mutex);
+		return -ENOMEM;
+	}
+
+	kernel->name = malloc(func_name_size);
+	if (kernel->name == NULL)
+	{
+		STARPU_PTHREAD_MUTEX_UNLOCK(&htbl_mutex);
+		free(kernel);
+		return -ENOMEM;
+	}
+
+	/* Finish the initialization while the entry is still private. */
+	memcpy(kernel->name, func_name, func_name_size);
+
+	/* Clear every slot of the array, not only those of the devices
+	 * currently detected: the array is indexed by node id. */
+	for (i = 0; i < STARPU_MAXMICDEVS; ++i)
+		kernel->func[i] = NULL;
+
+	int ret = _starpu_htbl_insert(&kernels_htbl, func_name, func_name_size, kernel);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&htbl_mutex);
+	if (ret != 0)
+	{
+		free(kernel->name);
+		free(kernel);
+		return -ENOMEM;
+	}
+
+	*symbol = kernel;
+
+	return 0;
+}
+
+/* Return the address, on the device driven by the calling worker, of the
+ * kernel designated by SYMBOL (as returned by
+ * _starpu_mic_src_register_kernel()).
+ *
+ * The address is looked up on the device the first time it is requested and
+ * cached in the kernel entry afterwards.
+ *
+ * Returns NULL when the caller is not a worker thread, when the worker is
+ * not attached to a valid MIC node, when SYMBOL is NULL or when the lookup
+ * on the device failed.
+ */
+starpu_mic_kernel_t _starpu_mic_src_get_kernel(starpu_mic_func_symbol_t symbol)
+{
+	int workerid = starpu_worker_get_id();
+	/* This function has to be called in the codelet only, by the thread
+	 * which will handle the task */
+	if (workerid < 0)
+		return NULL;
+
+	/* nodeid indexes both kernel->func and mic_nodes: reject anything
+	 * which is not a valid MIC node instead of reading out of bounds. */
+	int nodeid = starpu_worker_get_mp_nodeid(workerid);
+	if (nodeid < 0 || nodeid >= STARPU_MAXMICDEVS)
+		return NULL;
+
+	struct _starpu_mic_kernel *kernel = symbol;
+	if (kernel == NULL)
+		return NULL;
+
+	if (kernel->func[nodeid] == NULL)
+	{
+		struct _starpu_mp_node *node = mic_nodes[nodeid];
+		int ret = _starpu_src_common_lookup(node, (void (**)(void))&kernel->func[nodeid], kernel->name);
+		if (ret)
+			return NULL;
+	}
+
+	return kernel->func[nodeid];
+}
+
+/* Report an error which occurred when using a MIC device and print it in a
+ * human-readable style, then abort.
+ * It handles errors occurring when using COI.
+ *
+ * FUNC, FILE and LINE locate the call site, STATUS is the COI result code.
+ * The message is written to stderr: stdout may be buffered and its content
+ * would be lost by the abort which follows.
+ */
+void _starpu_mic_src_report_coi_error(const char *func, const char *file,
+				      const int line, const COIRESULT status)
+{
+	const char *errormsg = COIResultGetName(status);
+	fprintf(stderr, "SRC: oops in %s (%s:%d)... %d: %s \n", func, file, line, (int) status, errormsg);
+	STARPU_ASSERT(0);
+}
+
+/* Report an error which occurred when using a MIC device and print it in a
+ * human-readable style, then abort.
+ * It handles errors occurring when using SCIF.
+ *
+ * FUNC, FILE and LINE locate the call site, STATUS is an errno value (it is
+ * decoded with strerror()).
+ * The message is written to stderr: stdout may be buffered and its content
+ * would be lost by the abort which follows.
+ */
+void _starpu_mic_src_report_scif_error(const char *func, const char *file, const int line, const int status)
+{
+	const char *errormsg = strerror(status);
+	fprintf(stderr, "SRC: oops in %s (%s:%d)... %d: %s \n", func, file, line, status, errormsg);
+	STARPU_ASSERT(0);
+}
+
+/* Return the number of MIC devices reported by COI.
+ * COI is queried on the first call only; the answer (0 if the query failed)
+ * is kept in static storage and returned directly by the following calls.
+ */
+unsigned _starpu_mic_src_get_device_count(void)
+{
+	static unsigned short cached = 0;
+	static unsigned nb_devices = 0;
+
+	/* Fast path: COI has already been asked. */
+	if (cached)
+		return nb_devices;
+
+	/* If something is wrong with the COI engine, we shouldn't
+	 * use MIC devices (if there is any...) */
+	if (COIEngineGetCount(COI_ISA_MIC, &nb_devices) != COI_SUCCESS)
+		nb_devices = 0;
+
+	cached = 1;
+
+	return nb_devices;
+}
+
+/* Return the number of configured MIC devices, as recorded in the topology
+ * of the machine configuration.
+ */
+unsigned starpu_mic_device_get_count(void)
+{
+    return _starpu_get_machine_config ()->topology.nmicdevices;
+}
+
+/* Return the device kernel to run for implementation NIMPL of codelet CL.
+ *
+ * If the codelet provides a MIC function for this implementation, it is
+ * called and must return the kernel.  Otherwise the name of the CPU
+ * implementation is used: it is registered as a kernel name and resolved on
+ * the device of the calling worker.
+ *
+ * The function asserts that a kernel was found, so it never returns NULL.
+ */
+starpu_mic_kernel_t _starpu_mic_src_get_kernel_from_codelet(struct starpu_codelet *cl, unsigned nimpl)
+{
+	starpu_mic_kernel_t kernel = NULL;
+
+	starpu_mic_func_t func = _starpu_task_get_mic_nth_implementation(cl, nimpl);
+	if (func)
+	{
+		/* We execute the function contained in the codelet, it must return a
+		 * pointer to the function to execute on the device, either specified
+		 * directly by the user or by a call to starpu_mic_get_func().
+		 */
+		kernel = func();
+	}
+	else
+	{
+		/* If the user did not define any starpu_mic_func_t in cl->mic_func
+		 * we try to use cpu_func_name.
+		 */
+		char *func_name = _starpu_task_get_cpu_name_nth_implementation(cl, nimpl);
+		if (func_name)
+		{
+			starpu_mic_func_symbol_t symbol = NULL;
+
+			/* Only use the symbol if the registration succeeded:
+			 * on failure it is not written by the callee. */
+			if (_starpu_mic_src_register_kernel(&symbol, func_name) == 0)
+				kernel = _starpu_mic_src_get_kernel(symbol);
+		}
+	}
+	STARPU_ASSERT(kernel);
+
+	return kernel;
+}
+
+/* Initialize the node structure describing the MIC source.
+ *
+ * Two SCIF connections are opened towards the sink designated by
+ * node->peer_id, in this order: the command connection
+ * (node->mp_connection) and the data-transfer connection
+ * (node->host_sink_dt_connection).
+ * NOTE(review): the sink presumably accepts them in the same order, so the
+ * two calls should not be swapped -- confirm against the sink init code.
+ */
+void _starpu_mic_src_init(struct _starpu_mp_node *node)
+{
+    /* Let's initialize the connection with the peered sink device */
+    _starpu_mic_common_connect(&node->mp_connection.mic_endpoint,
+					STARPU_TO_MIC_ID(node->peer_id),
+					STARPU_MIC_SINK_PORT_NUMBER(node->peer_id),
+					STARPU_MIC_SOURCE_PORT_NUMBER);
+
+    /* Second connection, dedicated to data transfers */
+    _starpu_mic_common_connect(&node->host_sink_dt_connection.mic_endpoint,
+			       STARPU_TO_MIC_ID(node->peer_id),
+			       STARPU_MIC_SINK_DT_PORT_NUMBER(node->peer_id),
+			       STARPU_MIC_SOURCE_DT_PORT_NUMBER);
+}
+
+/* Deinitialize the MIC source node: close both SCIF endpoints of NODE, the
+ * data-transfer connection first, then the command connection.
+ * The return values of scif_close() are ignored.
+ */
+void _starpu_mic_src_deinit(struct _starpu_mp_node *node)
+{
+	scif_close(node->host_sink_dt_connection.mic_endpoint);
+	scif_close(node->mp_connection.mic_endpoint);
+}
+
+/* Fill INFO with the COI engine information of MIC device DEVID.
+ * DEVID must be a valid index in the handles array (asserted).
+ * A COI failure is reported through STARPU_MIC_SRC_REPORT_COI_ERROR with
+ * the result code returned by COI itself (COI does not report through
+ * errno).
+ */
+static void _starpu_mic_get_engine_info(COI_ENGINE_INFO *info, int devid)
+{
+	STARPU_ASSERT(devid >= 0 && devid < STARPU_MAXMICDEVS);
+
+	COIRESULT res = COIEngineGetInfo(handles[devid], sizeof(*info), info);
+	if (res != COI_SUCCESS)
+		STARPU_MIC_SRC_REPORT_COI_ERROR(res);
+}
+
+/* TODO: call _starpu_memory_manager_set_global_memory_size instead */
+/* Return the size of the physical memory of MIC device DEVID, as reported
+ * by the COI engine information (PhysicalMemory field). */
+size_t _starpu_mic_get_global_mem_size(int devid)
+{
+	COI_ENGINE_INFO engine_info;
+
+	_starpu_mic_get_engine_info(&engine_info, devid);
+	return engine_info.PhysicalMemory;
+}
+
+/* Return the size of the free physical memory of MIC device DEVID, as
+ * reported by the COI engine information (PhysicalMemoryFree field). */
+size_t _starpu_mic_get_free_mem_size(int devid)
+{
+	COI_ENGINE_INFO engine_info;
+
+	_starpu_mic_get_engine_info(&engine_info, devid);
+	return engine_info.PhysicalMemoryFree;
+}
+
+/* Allocate SIZE bytes on the MIC device backing MEMORY_NODE; the device
+ * address is returned through ADDR.
+ * The request is forwarded to _starpu_src_common_allocate() and its result
+ * is returned unchanged (0 if OK, 1 if not).
+ *
+ * A preliminary check that the device has (1.25 * size) bytes free used to
+ * be done here, because a transfer with scif is not possible when the MIC
+ * does not have enough free memory and the device then has no way to tell
+ * the host.  It is currently disabled:
+ *
+ *	int devid = _starpu_memory_node_to_devid(memory_node);
+ *	if (_starpu_mic_get_free_mem_size(devid) < size * 1.25)
+ *		return 1;
+ */
+int _starpu_mic_allocate_memory(void **addr, size_t size, unsigned memory_node)
+{
+	return _starpu_src_common_allocate(_starpu_mic_src_get_mp_node_from_memory_node(memory_node),
+					   addr, size);
+}
+
+/* Free the buffer at device address ADDR on the MIC device backing
+ * MEMORY_NODE, by sending a STARPU_FREE command to the device.
+ * SIZE is sent along with the address because the sink needs it to call
+ * scif_unregister on the corresponding window.
+ */
+void _starpu_mic_free_memory(void *addr, size_t size, unsigned memory_node)
+{
+	const struct _starpu_mp_node *mp_node = _starpu_mic_src_get_mp_node_from_memory_node(memory_node);
+	/* Designated initializers: do not depend on the field order. */
+	struct _starpu_mic_free_command cmd = { .addr = addr, .size = size };
+
+	/* No `return' here: this function returns void. */
+	_starpu_mp_common_send_command(mp_node, STARPU_FREE, &cmd, sizeof(cmd));
+}
+
+/* Synchronous host-to-device copy: transfer SIZE bytes from the host
+ * address SRC to the device address DST of the MIC backing DST_NODE.
+ * SRC_NODE is unused.  Returns the result of
+ * _starpu_src_common_copy_host_to_sink().
+ */
+int _starpu_mic_copy_ram_to_mic(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size)
+{
+	return _starpu_src_common_copy_host_to_sink(_starpu_mic_src_get_mp_node_from_memory_node(dst_node),
+						    src, dst, size);
+}
+
+/* Synchronous device-to-host copy: transfer SIZE bytes from the device
+ * address SRC of the MIC backing SRC_NODE to the host address DST.
+ * DST_NODE is unused.  Returns the result of
+ * _starpu_src_common_copy_sink_to_host().
+ */
+int _starpu_mic_copy_mic_to_ram(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size)
+{
+	return _starpu_src_common_copy_sink_to_host(_starpu_mic_src_get_mp_node_from_memory_node(src_node),
+						    src, dst, size);
+}
+
+/* Asynchronous transfers */
+
+/* Start the transfer of SIZE bytes from the host address SRC to the device
+ * address DST of the MIC backing DST_NODE, with scif_vwriteto() on the
+ * data-transfer endpoint.  SRC_NODE is unused.
+ * A SCIF failure is reported (and aborts); otherwise 0 is returned.
+ */
+int _starpu_mic_copy_ram_to_mic_async(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size)
+{
+	const struct _starpu_mp_node *mp_node = _starpu_mic_src_get_mp_node_from_memory_node(dst_node);
+	scif_epd_t epd = mp_node->host_sink_dt_connection.mic_endpoint;
+
+	int ret = scif_vwriteto(epd, src, size, (off_t)dst, 0);
+	if (ret < 0)
+		STARPU_MIC_SRC_REPORT_SCIF_ERROR(errno);
+
+	return 0;
+}
+
+/* Start the transfer of SIZE bytes from the device address SRC of the MIC
+ * backing SRC_NODE to the host address DST, with scif_vreadfrom() on the
+ * data-transfer endpoint.  DST_NODE is unused.
+ * A SCIF failure is reported (and aborts); otherwise 0 is returned.
+ */
+int _starpu_mic_copy_mic_to_ram_async(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size)
+{
+	const struct _starpu_mp_node *mp_node = _starpu_mic_src_get_mp_node_from_memory_node(src_node);
+	scif_epd_t epd = mp_node->host_sink_dt_connection.mic_endpoint;
+
+	int ret = scif_vreadfrom(epd, dst, size, (off_t)src, 0);
+	if (ret < 0)
+		STARPU_MIC_SRC_REPORT_SCIF_ERROR(errno);
+
+	return 0;
+}
+
+/* Initialize a _starpu_mic_async_event for the transfers issued so far on
+ * the data-transfer endpoint of the MIC backing MEMORY_NODE.
+ *
+ * A signal word is allocated and registered with SCIF, a fence mark is taken
+ * (for _starpu_mic_wait_request_completion()) and SCIF is asked to write
+ * STARPU_MIC_REQUEST_COMPLETE in the signal word once the transfers are
+ * complete (for _starpu_mic_request_is_complete()).
+ *
+ * Returns 0 on success, -ENOMEM if the signal page could not be allocated
+ * (event->signal is then NULL).  SCIF failures are reported and abort.
+ */
+int _starpu_mic_init_event(struct _starpu_mic_async_event *event, unsigned memory_node)
+{
+	const struct _starpu_mp_node *mp_node = _starpu_mic_src_get_mp_node_from_memory_node(memory_node);
+	scif_epd_t epd = mp_node->host_sink_dt_connection.mic_endpoint;
+
+	event->memory_node = memory_node;
+
+	/* The registered window must start on a page boundary and span a
+	 * multiple of the page size, so a whole page is allocated: registering
+	 * 0x1000 bytes over a smaller allocation would expose whatever the
+	 * allocator placed in the rest of the page. */
+	if (posix_memalign((void **)&(event->signal), 0x1000, 0x1000) != 0)
+	{
+		/* The pointer is unspecified after a failure: make the event
+		 * safely recognizable as "nothing to wait for". */
+		event->signal = NULL;
+		return -ENOMEM;
+	}
+	*(event->signal) = 0;
+
+	/* The size passed to scif_register is 0x1000 because it should be a multiple of the page size. */
+	if (scif_register(epd, event->signal, 0x1000, (off_t)(event->signal), SCIF_PROT_WRITE, SCIF_MAP_FIXED) < 0)
+		STARPU_MIC_SRC_REPORT_SCIF_ERROR(errno);
+
+	/* Mark for a future wait. */
+	if (scif_fence_mark(epd, SCIF_FENCE_INIT_SELF, &(event->mark)) < 0)
+		STARPU_MIC_SRC_REPORT_SCIF_ERROR(errno);
+
+	/* Tell scif to write STARPU_MIC_REQUEST_COMPLETE in event->signal when the transfer is complete.
+	 * We use this to test the end of a transfer. */
+	if (scif_fence_signal(epd, (off_t)event->signal, STARPU_MIC_REQUEST_COMPLETE, 0, 0, SCIF_FENCE_INIT_SELF | SCIF_SIGNAL_LOCAL) < 0)
+		STARPU_MIC_SRC_REPORT_SCIF_ERROR(errno);
+
+	return 0;
+}
+
+/* Block until the asynchronous request tracked by EVENT is complete, then
+ * release the signal word of the event (SCIF window unregistered, memory
+ * freed, pointer reset to NULL).
+ * Nothing is done if the signal word was already released.
+ */
+void _starpu_mic_wait_request_completion(struct _starpu_mic_async_event *event)
+{
+	const struct _starpu_mp_node *mp_node;
+	scif_epd_t epd;
+
+	if (event->signal == NULL)
+		return;
+
+	mp_node = _starpu_mic_src_get_mp_node_from_memory_node(event->memory_node);
+	epd = mp_node->host_sink_dt_connection.mic_endpoint;
+
+	if (scif_fence_wait(epd, event->mark) < 0)
+		STARPU_MIC_SRC_REPORT_SCIF_ERROR(errno);
+
+	if (scif_unregister(epd, (off_t)(event->signal), 0x1000) < 0)
+		STARPU_MIC_SRC_REPORT_SCIF_ERROR(errno);
+
+	free(event->signal);
+	event->signal = NULL;
+}
+
+/* Test whether the asynchronous request tracked by EVENT has ended.
+ * Return 1 if it has, 0 otherwise.
+ *
+ * When the completion is observed, the signal word of the event is released
+ * (SCIF window unregistered, memory freed, pointer reset to NULL).  If the
+ * signal word was already released, by a previous call or by
+ * _starpu_mic_wait_request_completion(), the request is known to be complete
+ * and 1 is returned without touching SCIF again.
+ */
+int _starpu_mic_request_is_complete(struct _starpu_mic_async_event *event)
+{
+	/* Already completed and released: there is no window to unregister
+	 * any more. */
+	if (event->signal == NULL)
+		return 1;
+
+	if (*(event->signal) != STARPU_MIC_REQUEST_COMPLETE)
+		return 0;
+
+	const struct _starpu_mp_node *mp_node = _starpu_mic_src_get_mp_node_from_memory_node(event->memory_node);
+	scif_epd_t epd = mp_node->host_sink_dt_connection.mic_endpoint;
+
+	if (scif_unregister(epd, (off_t)(event->signal), 0x1000) < 0)
+		STARPU_MIC_SRC_REPORT_SCIF_ERROR(errno);
+
+	free(event->signal);
+	event->signal = NULL;
+	return 1;
+}
+
+/* Entry point of the host thread driving one MIC device.
+ *
+ * ARG is the struct _starpu_worker_set of the device: one worker per MIC
+ * core, all of them being handled by this single thread.  Until the machine
+ * stops, the thread makes the data requests of the memory node progress,
+ * pops a task from the first worker of the set which has one and submits it
+ * to the device, or, when there is nothing to submit, polls the command
+ * endpoint for completion notifications.
+ *
+ * Always returns NULL.
+ */
+void *_starpu_mic_src_worker(void *arg)
+{
+	struct _starpu_worker_set *args = arg;
+	/* As all workers of a set share common data, we just use the first
+	 * one for initializing the following stuff. */
+	struct _starpu_worker *baseworker = &args->workers[0];
+	struct _starpu_machine_config *config = baseworker->config;
+	unsigned baseworkerid = baseworker - config->workers;
+
+	unsigned memnode = baseworker->memory_node;
+
+	_starpu_worker_init(baseworker, _STARPU_FUT_MIC_KEY);
+
+	// Current task for a thread managing a worker set has no sense.
+	_starpu_set_current_task(NULL);
+
+	baseworker->status = STATUS_UNKNOWN;
+
+	_STARPU_TRACE_WORKER_INIT_END
+
+	/* tell the main thread that this one is ready */
+	/* NOTE(review): the mutex macros are spelled as in the rest of this
+	 * file (STARPU_PTHREAD_MUTEX_*); the condition macro below was left
+	 * as found -- confirm which spelling the tree currently provides. */
+	STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);
+	args->set_is_initialized = 1;
+	_STARPU_PTHREAD_COND_SIGNAL(&args->ready_cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&args->mutex);
+
+
+	while (_starpu_machine_is_running())
+	{
+		int res;
+		struct starpu_task *task = NULL;
+		struct _starpu_job * j;
+		unsigned micworkerid = 0;
+
+		_STARPU_TRACE_START_PROGRESS(memnode);
+		_starpu_datawizard_progress(memnode, 1);
+		_STARPU_TRACE_END_PROGRESS(memnode);
+
+		STARPU_PTHREAD_MUTEX_LOCK(baseworker->sched_mutex);
+
+		/* We pop the task list of each worker in the set and process
+		 * the first task found.  The loop is left with `break' so that
+		 * `micworkerid' still designates the worker which provided the
+		 * task (a loop condition on `task' would leave it one too
+		 * far). */
+		for (micworkerid = 0 ; micworkerid < args->nworkers ; micworkerid++)
+		{
+			task = _starpu_pop_task (&args->workers[micworkerid]);
+			if (task != NULL)
+				break;
+		}
+
+		if (task == NULL)
+		{
+			/* No task to submit, so we can poll the MIC device for
+			 * completed jobs. */
+			struct pollfd fd = {
+			    .fd = mic_nodes[baseworker->mp_nodeid]->mp_connection.mic_endpoint,
+			    .events = POLLIN
+			};
+
+			if (0 < poll (&fd, 1, 0))
+			{
+				_starpu_mic_src_process_completed_job (args);
+			}
+			/* At this point, there is really nothing to do for the
+			 * thread so we can block.
+			 * XXX: blocking drivers is in fact broken. DO NOT USE IT ! */
+			else if (_starpu_worker_can_block(memnode))
+			{
+				_starpu_block_worker(baseworkerid, baseworker->sched_cond, baseworker->sched_mutex);
+			}
+
+			STARPU_PTHREAD_MUTEX_UNLOCK(baseworker->sched_mutex);
+			continue;
+		}
+
+		STARPU_PTHREAD_MUTEX_UNLOCK(baseworker->sched_mutex);
+
+		/* If the MIC core associated to `micworkerid' is already
+		 * processing a job, we push back this one in the worker task
+		 * list.  The task itself is pushed: the job has not been
+		 * looked up yet at this point. */
+		if (args->workers[micworkerid].current_task) {
+		    _starpu_push_task_to_workers(task);
+		    continue;
+		}
+
+		j = _starpu_get_job_associated_to_task(task);
+
+		/* can a MIC device do that task ? */
+		if (!_STARPU_MIC_MAY_PERFORM(j))
+		{
+			/* this isn't a mic task */
+			_starpu_push_task_to_workers(task);
+			continue;
+		}
+
+		args->workers[micworkerid].current_task = j->task;
+
+		res = _starpu_mic_src_execute_job (j, &args->workers[micworkerid]);
+
+		if (res)
+		{
+			switch (res)
+			{
+				case -EAGAIN:
+					_STARPU_DISP("ouch, put the codelet %p back ... \n", j);
+					_starpu_push_task(j);
+					STARPU_ABORT();
+					continue;
+				default:
+					STARPU_ASSERT(0);
+			}
+		}
+	}
+
+	_STARPU_TRACE_WORKER_DEINIT_START
+
+	_starpu_handle_all_pending_node_data_requests(memnode);
+
+	/* In case there remains some memory that was automatically
+	 * allocated by StarPU, we release it now. Note that data
+	 * coherency is not maintained anymore at that point ! */
+	_starpu_free_all_automatically_allocated_buffers(memnode);
+
+	/* Same trace key as the one given to _starpu_worker_init() above. */
+	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_MIC_KEY);
+
+	return NULL;
+
+}

+ 79 - 0
src/drivers/mic/driver_mic_source.h

@@ -0,0 +1,79 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DRIVER_MIC_SOURCE_H__
+#define __DRIVER_MIC_SOURCE_H__
+
+#include <starpu_mic.h>
+#include <common/config.h>
+
+#ifdef STARPU_USE_MIC
+
+#include <source/COIProcess_source.h>
+#include <source/COIEngine_source.h>
+
+#include <drivers/mp_common/mp_common.h>
+
+/* Array of structures containing all the information needed to exchange
+ * commands and data with the devices, indexed by MIC node id. */
+extern struct _starpu_mp_node *mic_nodes[STARPU_MAXMICDEVS];
+
+/* Forward declaration, needed by the prototypes below.  It must not declare
+ * an object: a variable defined in a header ends up defined in every
+ * translation unit including it. */
+struct _starpu_mic_async_event;
+
+/* Value written by SCIF in the signal word of an event once the transfers
+ * it tracks are complete. */
+#define STARPU_MIC_REQUEST_COMPLETE 42
+
+/* Report a COI (resp. SCIF) error together with the calling function, file
+ * and line.  STATUS is a COIRESULT (resp. an errno value). */
+#define STARPU_MIC_SRC_REPORT_COI_ERROR(status) \
+	_starpu_mic_src_report_coi_error(__starpu_func__, __FILE__, __LINE__, status)
+
+#define STARPU_MIC_SRC_REPORT_SCIF_ERROR(status) \
+	_starpu_mic_src_report_scif_error(__starpu_func__, __FILE__, __LINE__, status)
+
+/* Accessors to the mp_node of a device, from the calling worker thread or
+ * from a memory node. */
+const struct _starpu_mp_node *_starpu_mic_src_get_actual_thread_mp_node();
+const struct _starpu_mp_node *_starpu_mic_src_get_mp_node_from_memory_node(int memory_node);
+
+/* Kernel registration (by name) and per-device resolution. */
+int _starpu_mic_src_register_kernel(starpu_mic_func_symbol_t *symbol, const char *func_name);
+starpu_mic_kernel_t _starpu_mic_src_get_kernel(starpu_mic_func_symbol_t symbol);
+
+/* Backends of the error reporting macros above; they print the error and
+ * abort. */
+void _starpu_mic_src_report_coi_error(const char *func, const char *file, int line, const COIRESULT status);
+void _starpu_mic_src_report_scif_error(const char *func, const char *file, int line, const int status);
+
+unsigned _starpu_mic_src_get_device_count(void);
+starpu_mic_kernel_t _starpu_mic_src_get_kernel_from_codelet(struct starpu_codelet *cl, unsigned nimpl);
+
+/* Open / close the SCIF connections of a source node. */
+void _starpu_mic_src_init(struct _starpu_mp_node *node);
+void _starpu_mic_src_deinit(struct _starpu_mp_node *node);
+
+/* Total / free physical memory of device DEVID, as reported by COI. */
+size_t _starpu_mic_get_global_mem_size(int devid);
+size_t _starpu_mic_get_free_mem_size(int devid);
+
+/* Device memory management. */
+int _starpu_mic_allocate_memory(void **addr, size_t size, unsigned memory_node);
+void _starpu_mic_free_memory(void *addr, size_t size, unsigned memory_node);
+
+/* Synchronous and asynchronous transfers between the host and a device. */
+int _starpu_mic_copy_ram_to_mic(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size);
+int _starpu_mic_copy_mic_to_ram(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size);
+int _starpu_mic_copy_ram_to_mic_async(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size);
+int _starpu_mic_copy_mic_to_ram_async(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size);
+
+/* Completion tracking of asynchronous transfers. */
+int _starpu_mic_init_event(struct _starpu_mic_async_event *event, unsigned memory_node);
+void _starpu_mic_wait_request_completion(struct _starpu_mic_async_event *event);
+int _starpu_mic_request_is_complete(struct _starpu_mic_async_event *event);
+
+/* Entry point of the host thread driving a MIC device. */
+void *_starpu_mic_src_worker(void *arg);
+
+#endif /* STARPU_USE_MIC */
+
+
+#endif /* __DRIVER_MIC_SOURCE_H__ */

+ 45 - 0
src/drivers/mic/driver_mic_utils.c

@@ -0,0 +1,45 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <starpu.h>
+#include <starpu_mic.h>
+#include <common/config.h>
+
+#include <drivers/mp_common/source_common.h>
+#include <drivers/mic/driver_mic_source.h>
+
+
+/* Register the kernel named FUNC_NAME and return its symbol through SYMBOL.
+ * This is a public wrapper around _starpu_mic_src_register_kernel(): the
+ * kernel is recorded by name in a hash table (an already registered name
+ * yields the existing entry), and its address on a given device is only
+ * looked up later, by starpu_mic_get_kernel().
+ * If success, returns 0. If memory could not be allocated for the new
+ * entry, returns -ENOMEM.
+ */
+int starpu_mic_register_kernel(starpu_mic_func_symbol_t *symbol,
+			       const char *func_name)
+{
+	return _starpu_mic_src_register_kernel(symbol, func_name);
+}
+
+/* If success, return the pointer to the function defined by SYMBOL on the
+ * device linked to the calling worker; otherwise return NULL.
+ * This is a public wrapper around _starpu_mic_src_get_kernel(), which
+ * returns NULL when the caller is not a worker thread or when the lookup
+ * on the device fails.
+ */
+starpu_mic_kernel_t starpu_mic_get_kernel(starpu_mic_func_symbol_t symbol)
+{
+	return _starpu_mic_src_get_kernel(symbol);
+}

+ 234 - 0
src/drivers/mp_common/mp_common.c

@@ -0,0 +1,234 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <stdlib.h>
+#include <pthread.h>
+
+#include <drivers/mp_common/mp_common.h>
+#include <drivers/mp_common/sink_common.h>
+#include <drivers/mic/driver_mic_common.h>
+#include <drivers/mic/driver_mic_source.h>
+#include <drivers/mic/driver_mic_sink.h>
+#include <drivers/scc/driver_scc_common.h>
+#include <drivers/scc/driver_scc_source.h>
+#include <drivers/scc/driver_scc_sink.h>
+
+/* Allocate and initialize a message-passing node of kind NODE_KIND.
+ * When the function returns, every function pointer relevant to that kind
+ * is linked to the right implementation and every other one is NULL.
+ *
+ * PEER_ID is stored in the node; for STARPU_MIC_SOURCE it is also used as
+ * the device id. The kind-specific init callback, if any, is run once the
+ * command buffer has been allocated.
+ *
+ * The process is aborted if memory cannot be allocated or, for
+ * STARPU_MIC_SINK, if the DEVID or NB_MIC environment variables are unset.
+ * The returned node must be released with _starpu_mp_common_node_destroy().
+ */
+struct _starpu_mp_node * __attribute__((malloc))
+    _starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind,
+				  int peer_id)
+{
+	struct _starpu_mp_node *node;
+
+	/* Zero-initialize the node: not every kind sets every field below, and
+	 * the sink loop calls some of these callbacks unconditionally. An
+	 * unset callback must at least be NULL rather than garbage. */
+	node = (struct _starpu_mp_node *) calloc(1, sizeof(struct _starpu_mp_node));
+	if (node == NULL)
+		STARPU_ABORT();
+
+	node->kind = node_kind;
+
+	node->peer_id = peer_id;
+
+	switch(node->kind)
+	{
+#ifdef STARPU_USE_MIC
+		case STARPU_MIC_SOURCE:
+			{
+				node->nb_mp_sinks = starpu_mic_worker_get_count();
+				node->devid = peer_id;
+
+				node->init = _starpu_mic_src_init;
+				node->deinit = _starpu_mic_src_deinit;
+				node->report_error = _starpu_mic_src_report_scif_error;
+
+				node->mp_send = _starpu_mic_common_send;
+				node->mp_recv = _starpu_mic_common_recv;
+				node->dt_send = _starpu_mic_common_dt_send;
+				node->dt_recv = _starpu_mic_common_dt_recv;
+
+				node->execute = NULL;
+				node->nbcores = NULL;
+				node->allocate = NULL;
+				node->free = NULL;
+
+				/* A source node is only working on one core,
+				 * there is no need for this function */
+				node->get_nb_core = NULL;
+			}
+			break;
+
+		case STARPU_MIC_SINK:
+			{
+				/* Both variables are expected to be set in the
+				 * sink's environment; atoi(NULL) would crash. */
+				const char *devid_str = getenv("DEVID");
+				const char *nb_mic_str = getenv("NB_MIC");
+
+				if (devid_str == NULL || nb_mic_str == NULL)
+					STARPU_ABORT();
+
+				node->devid = atoi(devid_str);
+				node->nb_mp_sinks = atoi(nb_mic_str);
+
+				node->init = _starpu_mic_sink_init;
+				node->deinit = _starpu_mic_sink_deinit;
+				node->report_error = _starpu_mic_sink_report_error;
+
+				node->mp_send = _starpu_mic_common_send;
+				node->mp_recv = _starpu_mic_common_recv;
+				node->dt_send = _starpu_mic_common_dt_send;
+				node->dt_recv = _starpu_mic_common_dt_recv;
+
+				node->execute = _starpu_sink_common_execute;
+				node->nbcores = _starpu_sink_nbcores;
+				node->allocate = _starpu_mic_sink_allocate;
+				node->free = _starpu_mic_sink_free;
+
+				node->get_nb_core = _starpu_mic_sink_get_nb_core;
+			}
+			break;
+#endif /* STARPU_USE_MIC */
+
+#ifdef STARPU_USE_SCC
+		case STARPU_SCC_SOURCE:
+			{
+				node->init = _starpu_scc_src_init;
+				node->deinit = NULL;
+				node->report_error = _starpu_scc_common_report_rcce_error;
+
+				node->mp_send = _starpu_scc_common_send;
+				node->mp_recv = _starpu_scc_common_recv;
+				node->dt_send = _starpu_scc_common_send;
+				node->dt_recv = _starpu_scc_common_recv;
+				node->dt_send_to_device = NULL;
+				node->dt_recv_from_device = NULL;
+
+				node->execute = NULL;
+				node->nbcores = NULL;
+				node->allocate = NULL;
+				node->free = NULL;
+
+				node->get_nb_core = NULL;
+			}
+			break;
+
+		case STARPU_SCC_SINK:
+			{
+				node->init = _starpu_scc_sink_init;
+				node->deinit = _starpu_scc_sink_deinit;
+				node->report_error = _starpu_scc_common_report_rcce_error;
+
+				node->mp_send = _starpu_scc_common_send;
+				node->mp_recv = _starpu_scc_common_recv;
+				node->dt_send = _starpu_scc_common_send;
+				node->dt_recv = _starpu_scc_common_recv;
+				node->dt_send_to_device = _starpu_scc_sink_send_to_device;
+				node->dt_recv_from_device = _starpu_scc_sink_recv_from_device;
+
+				node->execute = _starpu_scc_sink_execute;
+				/* No core-count query is provided for SCC. */
+				node->nbcores = NULL;
+				node->allocate = _starpu_sink_common_allocate;
+				node->free = _starpu_sink_common_free;
+
+				node->get_nb_core = NULL;
+			}
+			break;
+#endif /* STARPU_USE_SCC */
+
+#ifdef STARPU_USE_MPI
+		case STARPU_MPI_SOURCE:
+			STARPU_ABORT();
+			break;
+
+		case STARPU_MPI_SINK:
+			STARPU_ABORT();
+			break;
+#endif /* STARPU_USE_MPI */
+
+		default:
+			STARPU_ASSERT(0);
+	}
+
+	/* Let's allocate the buffer, we want it to be big enough to contain
+	 * a command, an argument and the argument size */
+	node->buffer = malloc(BUFFER_SIZE);
+	if (node->buffer == NULL)
+		STARPU_ABORT();
+
+	if (node->init)
+		node->init(node);
+
+	return node;
+}
+
+/* Deinitialize NODE (through its deinit callback, if it has one) and release
+ * both its command buffer and the structure itself.
+ * Passing NULL is a no-op, mirroring free().
+ */
+
+void _starpu_mp_common_node_destroy(struct _starpu_mp_node *node)
+{
+	if (node == NULL)
+		return;
+
+	if (node->deinit)
+		node->deinit(node);
+
+	free(node->buffer);
+
+	free(node);
+}
+
+/* Send COMMAND to the peer of NODE, along with ARG if ARG_SIZE is non-zero.
+ *
+ * Two messages are emitted through node->mp_send: first a header made of
+ * the command followed by ARG_SIZE (staged in node->buffer), then the
+ * ARG_SIZE bytes pointed to by ARG, sent directly from ARG.
+ * ARG_SIZE must lie in [0, BUFFER_SIZE]: the receiving side stores the
+ * argument in its own BUFFER_SIZE-byte buffer.
+ */
+
+void _starpu_mp_common_send_command(const struct _starpu_mp_node *node,
+				    const enum _starpu_mp_command command,
+				    void *arg, int arg_size)
+{
+	STARPU_ASSERT(arg_size >= 0 && arg_size <= BUFFER_SIZE);
+	/* A payload length without a payload would make mp_send read NULL. */
+	STARPU_ASSERT(arg_size == 0 || arg != NULL);
+
+	/* MIC and MPI sizes are given through a int */
+	int command_size = sizeof(enum _starpu_mp_command);
+	int arg_size_size = sizeof(int);
+
+	/* Byte-typed view of the buffer: arithmetic on void * is a GNU
+	 * extension, not standard C. */
+	char *header = (char *) node->buffer;
+
+	/* Let's copy the data into the command line buffer */
+	memcpy(header, &command, command_size);
+	memcpy(header + command_size, &arg_size, arg_size_size);
+
+	node->mp_send(node, header, command_size + arg_size_size);
+
+	if (arg_size)
+		node->mp_send(node, arg, arg_size);
+}
+
+/* Return the command received from the peer of NODE. In case the peer sent
+ * an argument beside the command, the address of a copy of this argument is
+ * returned in *ARG and its length in *ARG_SIZE; otherwise *ARG is NULL and
+ * *ARG_SIZE is 0.
+ * There is no need to free this address: it points into node->buffer.
+ * For the same reason, the data pointed to by *ARG must not be relied on
+ * after a new call to _starpu_mp_common_recv_command() or
+ * _starpu_mp_common_send_command() on the same node, as both overwrite
+ * that buffer.
+ */
+
+enum _starpu_mp_command _starpu_mp_common_recv_command(const struct _starpu_mp_node *node,
+						       void **arg, int *arg_size)
+{
+	enum _starpu_mp_command command;
+
+	/* MIC and MPI sizes are given through a int */
+	int command_size = sizeof(enum _starpu_mp_command);
+	int arg_size_size = sizeof(int);
+
+	/* Byte-typed view of the buffer: arithmetic on void * is a GNU
+	 * extension, not standard C. */
+	char *header = (char *) node->buffer;
+
+	node->mp_recv(node, header, command_size + arg_size_size);
+
+	memcpy(&command, header, command_size);
+	memcpy(arg_size, header + command_size, arg_size_size);
+
+	/* If there is no argument (ie. arg_size == 0),
+	 * let's return the command right now */
+	if (!(*arg_size))
+	{
+		*arg = NULL;
+		return command;
+	}
+
+	/* The size comes from the wire: reject negative values too, they
+	 * would otherwise be handed to mp_recv as a length. */
+	STARPU_ASSERT(*arg_size > 0 && *arg_size <= BUFFER_SIZE);
+
+	node->mp_recv(node, node->buffer, *arg_size);
+
+	*arg = node->buffer;
+
+	return command;
+}

+ 178 - 0
src/drivers/mp_common/mp_common.h

@@ -0,0 +1,178 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __MP_COMMON_H__
+#define __MP_COMMON_H__
+
+#include <pthread.h>
+
+#include <starpu.h>
+#include <common/config.h>
+
+
+#ifdef STARPU_USE_MP
+
+#ifdef STARPU_USE_MIC
+#include <scif.h>
+#endif /* STARPU_USE_MIC */
+
+#define BUFFER_SIZE 256
+
+#define STARPU_MP_SRC_NODE 0
+#define STARPU_MP_SINK_NODE(a) ((a) + 1)
+
+#define STARPU_MP_COMMON_REPORT_ERROR(node, status) \
+	(node)->report_error(__starpu_func__, __FILE__, __LINE__, (status))
+
+
+/* Commands exchanged between a source (host) node and its sink(s).
+ * The numeric value of each command is what gets copied into the message
+ * header, so both sides must be built with the same values.
+ * Note that the numbering is hexadecimal and not contiguous (0x0a-0x0f and
+ * 0x17-0x41 are unused).
+ */
+enum _starpu_mp_command
+{
+	/* Session control */
+	STARPU_EXIT                = 0x00,
+
+	/* Kernel execution request and its error answer */
+	STARPU_EXECUTE             = 0x01,
+	STARPU_ERROR_EXECUTE       = 0x02,
+
+	/* Symbol lookup request and its answers */
+	STARPU_LOOKUP              = 0x03,
+	STARPU_ANSWER_LOOKUP       = 0x04,
+	STARPU_ERROR_LOOKUP        = 0x05,
+
+	/* Memory management requests and their answers */
+	STARPU_ALLOCATE            = 0x06,
+	STARPU_ANSWER_ALLOCATE     = 0x07,
+	STARPU_ERROR_ALLOCATE      = 0x08,
+	STARPU_FREE                = 0x09,
+
+	/* Data transfers */
+	STARPU_RECV_FROM_HOST      = 0x10,
+	STARPU_SEND_TO_HOST        = 0x11,
+	STARPU_RECV_FROM_SINK      = 0x12,
+	STARPU_SEND_TO_SINK        = 0x13,
+	STARPU_TRANSFER_COMPLETE   = 0x14,
+
+	/* Core count query and its answer */
+	STARPU_SINK_NBCORES        = 0x15,
+	STARPU_ANSWER_SINK_NBCORES = 0x16,
+
+	/* Kernel execution notifications */
+	STARPU_EXECUTION_SUBMITTED = 0x42,
+	STARPU_EXECUTION_COMPLETED = 0x43
+};
+
+/* Kind of a message-passing node: which accelerator family it belongs to
+ * and whether it is the source (host) side or the sink (device) side.
+ * STARPU_INVALID_KIND is what the sink-side kind detection returns when the
+ * STARPU_SINK environment variable holds no recognized value.
+ */
+enum _starpu_mp_node_kind
+{
+	STARPU_MIC_SINK,
+	STARPU_MIC_SOURCE,
+	STARPU_SCC_SINK,
+	STARPU_SCC_SOURCE,
+	STARPU_MPI_SINK,
+	STARPU_MPI_SOURCE,
+	STARPU_INVALID_KIND
+};
+
+/* Handle on a connection to a peer; which member is meaningful depends on
+ * the kind of node owning the connection. */
+union _starpu_mp_connection
+{
+#ifdef STARPU_USE_MIC
+	/* SCIF endpoint descriptor */
+	scif_epd_t mic_endpoint;
+#endif
+#ifdef STARPU_USE_SCC
+	int scc_nodeid;
+#endif
+	int mpi_nodeid;
+};
+
+/* Argument of the STARPU_RECV_FROM_HOST and STARPU_SEND_TO_HOST commands:
+ * SIZE bytes are to be transferred to or from ADDR, an address on the
+ * sink. */
+struct _starpu_mp_transfer_command
+{
+	size_t size;
+	void *addr;
+};
+
+/* Argument of the STARPU_RECV_FROM_SINK and STARPU_SEND_TO_SINK commands:
+ * SIZE bytes are to be exchanged with the device DEVID, reading from or
+ * writing to ADDR on the sink that receives the command. */
+struct _starpu_mp_transfer_command_to_device
+{
+	int devid;
+	size_t size;
+	void *addr;
+};
+
+/* Message-passing working node, whether source
+ * or sink.
+ * The function pointers are filled in by _starpu_mp_common_node_create()
+ * according to the node kind; a pointer that has no meaning for a given
+ * kind is expected to be NULL. */
+struct _starpu_mp_node
+{
+	enum _starpu_mp_node_kind kind;
+
+	/* Buffer used to stage command headers and received arguments,
+	 * allocated during node initialization.
+	 * Size : BUFFER_SIZE */
+	void *buffer;
+
+	/* For sink : -1.
+	 * For host : index of the sink = devid.
+	 */
+	int peer_id;
+
+	/* Only MIC uses this for now !!
+	 * This is the devid both for the sink and the host. */
+	int devid;
+
+	/* Only MIC uses this for now !!
+	 * This is the number of MIC devices on the system. */
+	unsigned int nb_mp_sinks;
+
+	/* Connection used for command passing between the host thread and the
+	 * sink it controls */
+	union _starpu_mp_connection mp_connection;
+
+	/* Only MIC uses this for now !!
+	 * Connection used for data transfers between the host and its sink. */
+	union _starpu_mp_connection host_sink_dt_connection;
+
+	/* Only MIC uses this for now !!
+	 * Only sinks use this for now !!
+	 * Connections used for data transfers between devices.
+	 * A sink opens a connection with each other sink,
+	 * thus each sink can directly send data to each other.
+	 * On the sink number j :
+	 *  - sink_sink_dt_connections[i] (i != j) is the connection to the
+	 *    sink number i ;
+	 *  - sink_sink_dt_connections[j], its own slot, is not initialized. */
+	union _starpu_mp_connection *sink_sink_dt_connections;
+
+	/* Node general functions */
+	void (*init)(struct _starpu_mp_node *node);
+	void (*deinit)(struct _starpu_mp_node *node);
+	/* Called as (function name, file name, line, status), see
+	 * STARPU_MP_COMMON_REPORT_ERROR */
+	void (*report_error)(const char *, const char *, const int, const int);
+
+	/* Message passing: (node, buffer, length in bytes) */
+	void (*mp_send)(const struct _starpu_mp_node *, void *, int);
+	void (*mp_recv)(const struct _starpu_mp_node *, void *, int);
+
+	/* Data transfers: (node, buffer, length in bytes); the *_device
+	 * variants take the peer device id as second argument */
+	void (*dt_send)(const struct _starpu_mp_node *, void *, int);
+	void (*dt_recv)(const struct _starpu_mp_node *, void *, int);
+	void (*dt_send_to_device)(const struct _starpu_mp_node *, int, void *, int);
+	void (*dt_recv_from_device)(const struct _starpu_mp_node *, int, void *, int);
+
+	/* Sink-side command handlers: (node, received argument, its size) */
+	void (*execute)(const struct _starpu_mp_node *, void *, int);
+	void (*nbcores)(const struct _starpu_mp_node *);
+	void (*allocate)(const struct _starpu_mp_node *, void *, int);
+	void (*free)(const struct _starpu_mp_node *, void *, int);
+
+	unsigned int (*get_nb_core)(void);
+};
+
+struct _starpu_mp_node * _starpu_mp_common_node_create(enum _starpu_mp_node_kind node_kind, int peer_devid);
+
+void _starpu_mp_common_node_destroy(struct _starpu_mp_node *node);
+
+void _starpu_mp_common_send_command(const struct _starpu_mp_node *node,
+				    const enum _starpu_mp_command command,
+				    void *arg, int arg_size);
+
+enum _starpu_mp_command _starpu_mp_common_recv_command(const struct _starpu_mp_node *node,
+						    void **arg, int *arg_size);
+
+
+#endif /* STARPU_USE_MP */
+
+#endif /* __MP_COMMON_H__ */

+ 275 - 0
src/drivers/mp_common/sink_common.c

@@ -0,0 +1,275 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <dlfcn.h>
+
+#include <common/COISysInfo_common.h>
+
+#include <starpu.h>
+#include <common/config.h>
+#include <common/utils.h>
+#include <drivers/mp_common/mp_common.h>
+
+#include "sink_common.h"
+
+/* Tell which kind of sink the running process is, according to the
+ * STARPU_SINK environment variable ("STARPU_MIC", "STARPU_SCC" or
+ * "STARPU_MPI").
+ * Any other value yields STARPU_INVALID_KIND; the variable being unset
+ * trips an assertion.
+ */
+static enum _starpu_mp_node_kind _starpu_sink_common_get_kind(void)
+{
+	/* Environment variable STARPU_SINK must be defined when running on
+	 * the sink side: it names the kind of node we are running on */
+	char *node_kind = getenv("STARPU_SINK");
+	STARPU_ASSERT(node_kind);
+
+	if (strcmp(node_kind, "STARPU_MIC") == 0)
+		return STARPU_MIC_SINK;
+
+	if (strcmp(node_kind, "STARPU_SCC") == 0)
+		return STARPU_SCC_SINK;
+
+	if (strcmp(node_kind, "STARPU_MPI") == 0)
+		return STARPU_MPI_SINK;
+
+	return STARPU_INVALID_KIND;
+}
+
+/* Answer a STARPU_SINK_NBCORES request (sent by
+ * `_starpu_src_common_sink_nbcores') with a STARPU_ANSWER_SINK_NBCORES
+ * message carrying the core count as an int.
+ * Only MIC is supported for now: any other kind of sink reports 0.
+ */
+void
+_starpu_sink_nbcores (const struct _starpu_mp_node *node)
+{
+    int core_count;
+
+    if (_starpu_sink_common_get_kind () != STARPU_MIC_SINK)
+	core_count = 0;
+    else
+	core_count = COISysGetCoreCount();
+
+    _starpu_mp_common_send_command (node, STARPU_ANSWER_SINK_NBCORES,
+				    &core_count, sizeof (int));
+}
+
+
+/* Receive a packet from _starpu_src_common_execute_kernel in the form below :
+ * [Function pointer on sink, core id, number of interfaces, interfaces
+ * (union _starpu_interface), cl_arg]
+ * Then call the function given, passing as argument an array containing the
+ * addresses of the received interfaces.
+ *
+ * STARPU_EXECUTION_SUBMITTED is sent before the kernel runs and
+ * STARPU_EXECUTION_COMPLETED (carrying the core id) once it has returned.
+ * If the packet announces more interfaces than STARPU_NMAXBUFS, or is too
+ * short for what it announces, STARPU_ERROR_EXECUTE is sent instead and the
+ * kernel is not run.
+ */
+void _starpu_sink_common_execute(const struct _starpu_mp_node *node,
+					void *arg, int arg_size)
+{
+	unsigned id = 0;
+
+	/* Byte-typed cursor: arithmetic on void * is a GNU extension */
+	char *arg_ptr = (char *) arg;
+	void (*kernel)(void **, void *) = NULL;
+	unsigned coreid = 0;
+	unsigned nb_interfaces = 0;
+	void *interfaces[STARPU_NMAXBUFS];
+	void *cl_arg;
+	size_t header_size = sizeof(kernel) + sizeof(coreid) + sizeof(nb_interfaces);
+	size_t needed_size;
+
+	/* Everything below is read from a buffer filled by the peer: check
+	 * the lengths before trusting the content. */
+	if (arg == NULL || arg_size < 0 || (size_t) arg_size < header_size)
+	{
+		_starpu_mp_common_send_command(node, STARPU_ERROR_EXECUTE,
+					       NULL, 0);
+		return;
+	}
+
+	memcpy(&kernel, arg_ptr, sizeof(kernel));
+	arg_ptr += sizeof(kernel);
+
+	memcpy(&coreid, arg_ptr, sizeof(coreid));
+	arg_ptr += sizeof(coreid);
+
+	memcpy(&nb_interfaces, arg_ptr, sizeof(nb_interfaces));
+	arg_ptr += sizeof(nb_interfaces);
+
+	/* interfaces[] lives on the stack and only has STARPU_NMAXBUFS
+	 * slots; the interfaces themselves must fit in the packet. */
+	needed_size = header_size +
+		(size_t) nb_interfaces * sizeof(union _starpu_interface);
+	if (nb_interfaces > STARPU_NMAXBUFS || (size_t) arg_size < needed_size)
+	{
+		_starpu_mp_common_send_command(node, STARPU_ERROR_EXECUTE,
+					       NULL, 0);
+		return;
+	}
+
+	/* The function needs an array pointing to each interface it needs
+	 * during execution. As in sink-side there is no mean to know which
+	 * kind of interface to expect, the array is composed of unions of
+	 * interfaces, thus we expect the same size anyway */
+	for (id = 0; id < nb_interfaces; id++)
+	{
+		interfaces[id] = arg_ptr;
+		arg_ptr += sizeof(union _starpu_interface);
+	}
+
+	/* Was cl_arg sent ? */
+	if ((size_t) arg_size > needed_size)
+		cl_arg = arg_ptr;
+	else
+		cl_arg = NULL;
+
+	/* XXX: in the future, we will not have to directly execute the kernel
+	 * but submit it to the correct local worker. */
+	_starpu_mp_common_send_command(node, STARPU_EXECUTION_SUBMITTED,
+				       NULL, 0);
+
+	/* XXX: we keep the synchronous execution model on the sink side for
+	 * now. */
+	kernel(interfaces, cl_arg);
+
+	_starpu_mp_common_send_command(node, STARPU_EXECUTION_COMPLETED,
+				       &coreid, sizeof(coreid));
+}
+
+
+/* Look FUNC_NAME up among the symbols of the running program and answer the
+ * host: STARPU_ANSWER_LOOKUP carrying the function address on success,
+ * STARPU_ERROR_LOOKUP otherwise.
+ */
+static void _starpu_sink_common_lookup(const struct _starpu_mp_node *node,
+				       char *func_name)
+{
+	void (*func)(void) = NULL;
+
+	if (func_name != NULL)
+	{
+		void *dl_handle = dlopen(NULL, RTLD_NOW);
+		if (dl_handle != NULL)
+		{
+			func = dlsym(dl_handle, func_name);
+			/* Each dlopen() takes a reference on the handle:
+			 * drop it, this function is called once per lookup.
+			 * The main program is never unloaded, so the address
+			 * stays valid. */
+			dlclose(dl_handle);
+		}
+	}
+
+	/* If we couldn't find the function, let's send an error to the host.
+	 * The user probably made a mistake in the name */
+	if (func)
+		_starpu_mp_common_send_command(node, STARPU_ANSWER_LOOKUP,
+					       &func, sizeof(func));
+	else
+		_starpu_mp_common_send_command(node, STARPU_ERROR_LOOKUP,
+					       NULL, 0);
+}
+
+/* Handle a STARPU_ALLOCATE request: ARG holds the requested size (a size_t).
+ * The host is answered with STARPU_ANSWER_ALLOCATE carrying the address of
+ * the new area, or with STARPU_ERROR_ALLOCATE if malloc() failed.
+ */
+void _starpu_sink_common_allocate(const struct _starpu_mp_node *mp_node,
+				  void *arg, int arg_size)
+{
+    STARPU_ASSERT(arg_size == sizeof(size_t));
+
+    size_t requested = *(size_t *)(arg);
+    void *addr = malloc(requested);
+
+    if (!addr)
+    {
+	/* The allocation failed, let the host know. */
+	_starpu_mp_common_send_command(mp_node, STARPU_ERROR_ALLOCATE,
+				       NULL, 0);
+	return;
+    }
+
+    _starpu_mp_common_send_command(mp_node, STARPU_ANSWER_ALLOCATE,
+				   &addr, sizeof(addr));
+}
+
+/* Handle a STARPU_FREE request: ARG holds the address (a void *) of the area
+ * to release. No answer is sent back to the host. */
+void _starpu_sink_common_free(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED,
+			      void *arg, int arg_size)
+{
+	void *area;
+
+	STARPU_ASSERT(arg_size == sizeof(void *));
+
+	area = *(void **)(arg);
+	free(area);
+}
+
+/* Handle a STARPU_RECV_FROM_HOST request: receive the announced number of
+ * bytes from the host, through dt_recv, at the announced sink address. */
+static void _starpu_sink_common_copy_from_host(const struct _starpu_mp_node *mp_node,
+					       void *arg, int arg_size)
+{
+    struct _starpu_mp_transfer_command *xfer;
+
+    STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command));
+
+    xfer = (struct _starpu_mp_transfer_command *)arg;
+    mp_node->dt_recv(mp_node, xfer->addr, xfer->size);
+}
+
+/* Handle a STARPU_SEND_TO_HOST request: send the announced number of bytes,
+ * read at the announced sink address, to the host through dt_send. */
+static void _starpu_sink_common_copy_to_host(const struct _starpu_mp_node *mp_node,
+					     void *arg, int arg_size)
+{
+    struct _starpu_mp_transfer_command *xfer;
+
+    STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command));
+
+    xfer = (struct _starpu_mp_transfer_command *)arg;
+    mp_node->dt_send(mp_node, xfer->addr, xfer->size);
+}
+
+/* Handle a STARPU_RECV_FROM_SINK request: receive the announced number of
+ * bytes from the announced device, through dt_recv_from_device, then tell
+ * the host that the transfer is over with STARPU_TRANSFER_COMPLETE. */
+static void _starpu_sink_common_copy_from_sink(const struct _starpu_mp_node *mp_node,
+					       void *arg, int arg_size)
+{
+    struct _starpu_mp_transfer_command_to_device *xfer;
+
+    STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command_to_device));
+
+    xfer = (struct _starpu_mp_transfer_command_to_device *)arg;
+    mp_node->dt_recv_from_device(mp_node, xfer->devid, xfer->addr, xfer->size);
+
+    _starpu_mp_common_send_command(mp_node, STARPU_TRANSFER_COMPLETE, NULL, 0);
+}
+
+/* Handle a STARPU_SEND_TO_SINK request: send the announced number of bytes,
+ * read at the announced address, to the announced device through
+ * dt_send_to_device. No answer is sent to the host from here. */
+static void _starpu_sink_common_copy_to_sink(const struct _starpu_mp_node *mp_node,
+					     void *arg, int arg_size)
+{
+    struct _starpu_mp_transfer_command_to_device *xfer;
+
+    STARPU_ASSERT(arg_size == sizeof(struct _starpu_mp_transfer_command_to_device));
+
+    xfer = (struct _starpu_mp_transfer_command_to_device *)arg;
+    mp_node->dt_send_to_device(mp_node, xfer->devid, xfer->addr, xfer->size);
+}
+
+/* Function looping on the sink, waiting for commands from the host and
+ * dispatching them until STARPU_EXIT is received; the node is then
+ * destroyed and the process exits with status 0.
+ * The kind of sink is taken from the STARPU_SINK environment variable; an
+ * unrecognized value is reported through _STARPU_ERROR.
+ */
+
+void _starpu_sink_common_worker(void)
+{
+	struct _starpu_mp_node *node = NULL;
+	enum _starpu_mp_command command = STARPU_EXIT;
+	int arg_size = 0;
+	void *arg = NULL;
+
+	enum _starpu_mp_node_kind node_kind = _starpu_sink_common_get_kind();
+
+	/* Adjacent string literals are concatenated as is: each piece needs
+	 * its own trailing space. */
+	if (node_kind == STARPU_INVALID_KIND)
+		_STARPU_ERROR("No valid sink kind retrieved, use the "
+			      "STARPU_SINK environment variable to specify "
+			      "this\n");
+
+	/* Create and initialize the node */
+	node = _starpu_mp_common_node_create(node_kind, -1);
+
+	while ((command = _starpu_mp_common_recv_command(node, &arg, &arg_size)) != STARPU_EXIT)
+	{
+		switch(command)
+		{
+			case STARPU_EXECUTE:
+				node->execute(node, arg, arg_size);
+				break;
+			case STARPU_SINK_NBCORES:
+				node->nbcores (node);
+				break;
+			case STARPU_LOOKUP:
+				if (arg == NULL || arg_size <= 0)
+				{
+					/* No name was sent at all */
+					_starpu_mp_common_send_command(node, STARPU_ERROR_LOOKUP,
+								       NULL, 0);
+					break;
+				}
+				/* The name comes from the peer: make sure it
+				 * is a terminated string before using it. */
+				((char *) arg)[arg_size - 1] = '\0';
+				_starpu_sink_common_lookup(node, (char *) arg);
+				break;
+
+			case STARPU_ALLOCATE:
+				node->allocate(node, arg, arg_size);
+				break;
+
+			case STARPU_FREE:
+				node->free(node, arg, arg_size);
+				break;
+
+			case STARPU_RECV_FROM_HOST:
+				_starpu_sink_common_copy_from_host(node, arg, arg_size);
+				break;
+
+			case STARPU_SEND_TO_HOST:
+				_starpu_sink_common_copy_to_host(node, arg, arg_size);
+				break;
+
+			case STARPU_RECV_FROM_SINK:
+				_starpu_sink_common_copy_from_sink(node, arg, arg_size);
+				break;
+
+			case STARPU_SEND_TO_SINK:
+				_starpu_sink_common_copy_to_sink(node, arg, arg_size);
+				break;
+
+			default:
+				printf("Oops, command %x unrecognized\n", command);
+		}
+	}
+
+	/* Deinitialize the node and release it */
+	_starpu_mp_common_node_destroy(node);
+
+	exit(0);
+}

+ 39 - 0
src/drivers/mp_common/sink_common.h

@@ -0,0 +1,39 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#ifndef __SINK_COMMON_H__
+#define __SINK_COMMON_H__
+
+#include <common/config.h>
+
+
+#ifdef STARPU_USE_MP
+
+#include <drivers/mp_common/mp_common.h>
+
+void _starpu_sink_common_worker(void);
+
+void _starpu_sink_common_execute(const struct _starpu_mp_node *node, void *arg, int arg_size);
+void _starpu_sink_nbcores (const struct _starpu_mp_node *node);
+
+void _starpu_sink_common_allocate(const struct _starpu_mp_node *mp_node, void *arg, int arg_size);
+void _starpu_sink_common_free(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, void *arg, int arg_size);
+
+#endif /* STARPU_USE_MP */
+
+
+#endif /* __SINK_COMMON_H__ */

+ 368 - 0
src/drivers/mp_common/source_common.c

@@ -0,0 +1,368 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <errno.h>
+#include <pthread.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <starpu.h>
+#include <datawizard/coherency.h>
+#include <drivers/mp_common/mp_common.h>
+
+/* Ask the sink linked to NODE for its number of cores and store the answer
+ * in *BUF. Always returns 0; an unexpected answer trips an assertion.
+ */
+int
+_starpu_src_common_sink_nbcores (const struct _starpu_mp_node *node, int *buf)
+{
+    void *arg;
+    int arg_size = sizeof (int);
+    enum _starpu_mp_command answer;
+
+    /* The request itself carries no argument. */
+    _starpu_mp_common_send_command (node, STARPU_SINK_NBCORES, NULL, 0);
+    answer = _starpu_mp_common_recv_command (node, &arg, &arg_size);
+
+    STARPU_ASSERT (answer == STARPU_ANSWER_SINK_NBCORES && arg_size == sizeof (int));
+
+    /* ARG points into the node's buffer: copy the value out of it. */
+    memcpy (buf, arg, arg_size);
+    return 0;
+}
+
+/* Send a request to the sink linked to NODE for the pointer to the
+ * function defined by FUNC_NAME.
+ * In case of success, it returns 0 and *FUNC_PTR contains the pointer ;
+ * else it returns -ESPIPE if the function was not found.
+ *
+ * FUNC_NAME is sent with its terminating '\0'; the whole string must fit
+ * in BUFFER_SIZE bytes (asserted by the send path).
+ * NODE is const-qualified to match the prototype in source_common.h.
+ */
+int _starpu_src_common_lookup(const struct _starpu_mp_node *node,
+			      void (**func_ptr)(void), const char *func_name)
+{
+	enum _starpu_mp_command answer;
+	void *arg;
+	int arg_size;
+
+	/* strlen ignore the terminating '\0' */
+	arg_size = (strlen(func_name) + 1) * sizeof(char);
+
+	_starpu_mp_common_send_command(node, STARPU_LOOKUP, (void *) func_name,
+				       arg_size);
+	answer = _starpu_mp_common_recv_command(node, (void **) &arg,
+						&arg_size);
+
+	if (answer == STARPU_ERROR_LOOKUP)
+		return -ESPIPE;
+
+	/* We have to be sure the device answered the right question and the
+	 * answer has the right size */
+	STARPU_ASSERT(answer == STARPU_ANSWER_LOOKUP &&
+		      arg_size == sizeof(*func_ptr));
+
+	memcpy(func_ptr, arg, arg_size);
+
+	return 0;
+}
+
+/* Send a message to the sink to execute a kernel.
+ * The message sent has the form below :
+ * [Function pointer on sink, core id, number of interfaces, interfaces
+ * (union _starpu_interface), cl_arg]
+ *
+ * Returns 0 once the sink has acknowledged the submission, -EINVAL if the
+ * sink answered STARPU_ERROR_EXECUTE, and -ENOMEM if the message could not
+ * be allocated (nothing is sent in that case).
+ */
+int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
+				      void (*kernel)(void), unsigned coreid,
+				      starpu_data_handle_t *handles,
+				      void **interfaces,
+				      unsigned nb_interfaces,
+				      void *cl_arg, size_t cl_arg_size)
+{
+	unsigned id;
+	/* Byte-typed cursor: arithmetic on void * is a GNU extension */
+	char *buffer, *buffer_ptr;
+	void *arg = NULL;
+	int buffer_size = 0, arg_size = 0;
+	enum _starpu_mp_command answer;
+
+	/* If the user didn't give any cl_arg, there is no need to send it */
+	buffer_size =
+	    sizeof(kernel) + sizeof(coreid) + sizeof(nb_interfaces) +
+	    nb_interfaces * sizeof(union _starpu_interface);
+	if (cl_arg)
+	{
+		STARPU_ASSERT(cl_arg_size);
+		buffer_size += cl_arg_size;
+	}
+
+	/* We give to send_command a buffer we just allocated, which contains
+	 * a pointer to the function (sink-side), core on which execute this
+	 * function (sink-side), number of interfaces we send,
+	 * an array of generic (union) interfaces and the value of cl_arg.
+	 * The buffer is zeroed: each interface only fills the beginning of
+	 * its union-sized slot, the rest must not carry stale heap content
+	 * over the wire. */
+	buffer = (char *) calloc(1, buffer_size);
+	if (buffer == NULL)
+		return -ENOMEM;
+	buffer_ptr = buffer;
+
+	memcpy(buffer_ptr, &kernel, sizeof(kernel));
+	buffer_ptr += sizeof(kernel);
+
+	memcpy(buffer_ptr, &coreid, sizeof(coreid));
+	buffer_ptr += sizeof(coreid);
+
+	memcpy(buffer_ptr, &nb_interfaces, sizeof(nb_interfaces));
+	buffer_ptr += sizeof(nb_interfaces);
+
+	/* Message-passing execution is a particular case as the codelet is
+	 * executed on a sink with a different memory, whereas a codelet is
+	 * executed on the host part for the other accelerators.
+	 * Thus we need to send a copy of each interface on the MP device */
+	for (id = 0; id < nb_interfaces; id++)
+	{
+		starpu_data_handle_t handle = handles[id];
+		memcpy (buffer_ptr, interfaces[id],
+			handle->ops->interface_size);
+		/* The sink side has no mean to get the type of each
+		 * interface, we use a union to make it generic and permit the
+		 * sink to go through the array */
+		buffer_ptr += sizeof(union _starpu_interface);
+	}
+
+	if (cl_arg)
+		memcpy(buffer_ptr, cl_arg, cl_arg_size);
+
+	_starpu_mp_common_send_command(node, STARPU_EXECUTE, buffer, buffer_size);
+	answer = _starpu_mp_common_recv_command(node, &arg, &arg_size);
+
+	/* The message has been sent: release it before looking at the
+	 * answer so that no return path leaks it. */
+	free(buffer);
+
+	if (answer == STARPU_ERROR_EXECUTE)
+		return -EINVAL;
+
+	STARPU_ASSERT(answer == STARPU_EXECUTION_SUBMITTED);
+
+	return 0;
+
+}
+
+/* Convenience wrapper around _starpu_src_common_execute_kernel(): run KERNEL
+ * on the sink linked to NODE with the handles, interfaces, buffer count and
+ * cl_arg taken from TASK (the buffer count comes from the task's codelet).
+ * The return value is that of _starpu_src_common_execute_kernel().
+ */
+int _starpu_src_common_execute_kernel_from_task(const struct _starpu_mp_node *node,
+						void (*kernel)(void), unsigned coreid,
+						struct starpu_task *task)
+{
+    unsigned nb_buffers = task->cl->nbuffers;
+
+    return _starpu_src_common_execute_kernel(node, kernel, coreid,
+					     task->handles,
+					     task->interfaces,
+					     nb_buffers,
+					     task->cl_arg,
+					     task->cl_arg_size);
+}
+
+/* Ask the sink linked to MP_NODE to allocate SIZE bytes.
+ * Returns 0 and stores the sink-side address of the area in *ADDR on
+ * success; returns 1 if the sink reported that the allocation failed.
+ */
+int _starpu_src_common_allocate(const struct _starpu_mp_node *mp_node,
+								void **addr, size_t size)
+{
+	void *arg;
+	int arg_size;
+	enum _starpu_mp_command answer;
+
+	_starpu_mp_common_send_command(mp_node, STARPU_ALLOCATE,
+				       &size, sizeof(size));
+	answer = _starpu_mp_common_recv_command(mp_node, &arg, &arg_size);
+
+	if (answer != STARPU_ERROR_ALLOCATE)
+	{
+		STARPU_ASSERT(answer == STARPU_ANSWER_ALLOCATE &&
+				  arg_size == sizeof(*addr));
+
+		/* ARG points into the node's buffer: copy the address out. */
+		memcpy(addr, arg, arg_size);
+		return 0;
+	}
+
+	return 1;
+}
+
+/* Ask the sink linked to MP_NODE to release the memory area located at ADDR
+ * (a sink-side address). No answer is awaited.
+ */
+void _starpu_src_common_free(const struct _starpu_mp_node *mp_node,
+							 void *addr)
+{
+	/* The address value itself is the payload of the command. */
+	void *sink_addr = addr;
+
+	_starpu_mp_common_send_command(mp_node, STARPU_FREE,
+				       &sink_addr, sizeof(sink_addr));
+}
+
+/* Copy SIZE bytes from SRC (host memory) to DST (an address on the sink
+ * linked to MP_NODE): the sink is first told to expect the data, which is
+ * then pushed through dt_send. Always returns 0.
+ */
+int _starpu_src_common_copy_host_to_sink(const struct _starpu_mp_node *mp_node,
+										 void *src, void *dst, size_t size)
+{
+	struct _starpu_mp_transfer_command cmd;
+
+	cmd.size = size;
+	cmd.addr = dst;
+
+	_starpu_mp_common_send_command(mp_node, STARPU_RECV_FROM_HOST,
+				       &cmd, sizeof(cmd));
+	mp_node->dt_send(mp_node, src, size);
+
+	return 0;
+}
+
+/* Copy SIZE bytes from SRC (an address on the sink linked to MP_NODE) to DST
+ * (host memory): the sink is first told to send the data, which is then
+ * pulled through dt_recv. Always returns 0.
+ */
+int _starpu_src_common_copy_sink_to_host(const struct _starpu_mp_node *mp_node,
+										 void *src, void *dst, size_t size)
+{
+	struct _starpu_mp_transfer_command cmd;
+
+	cmd.size = size;
+	cmd.addr = src;
+
+	_starpu_mp_common_send_command(mp_node, STARPU_SEND_TO_HOST,
+				       &cmd, sizeof(cmd));
+	mp_node->dt_recv(mp_node, dst, size);
+
+	return 0;
+}
+
+/* Have the sink linked to SRC_NODE send SIZE bytes located at SRC directly
+ * to the sink linked to DST_NODE, which stores them at DST.
+ * The call returns 0 once the destination sink has reported
+ * STARPU_TRANSFER_COMPLETE.
+ */
+int _starpu_src_common_copy_sink_to_sink(const struct _starpu_mp_node *src_node,
+		const struct _starpu_mp_node *dst_node, void *src, void *dst, size_t size)
+{
+	enum _starpu_mp_command answer;
+	void *arg;
+	int arg_size;
+
+	/* Each side is told who its peer is and which local address to use. */
+	struct _starpu_mp_transfer_command_to_device cmd = {dst_node->peer_id, size, src};
+
+	/* Sending side first... */
+	_starpu_mp_common_send_command(src_node, STARPU_SEND_TO_SINK,
+				       &cmd, sizeof(cmd));
+
+	/* ...then the receiving side, reusing the same command structure. */
+	cmd.devid = src_node->peer_id;
+	cmd.size = size;
+	cmd.addr = dst;
+	_starpu_mp_common_send_command(dst_node, STARPU_RECV_FROM_SINK,
+				       &cmd, sizeof(cmd));
+
+	/* Only the receiving side reports when the transfer is finished. */
+	answer = _starpu_mp_common_recv_command(dst_node, &arg, &arg_size);
+
+	STARPU_ASSERT(answer == STARPU_TRANSFER_COMPLETE);
+
+	return 0;
+}
+
+/* 5 functions to determine the executable to run on the device (MIC, SCC,
+ * MPI).
+ */
+
+/* Store the concatenation of FIRST, SECOND and THIRD in FINAL, which must
+ * be large enough to hold it (no bound is checked here). */
+static void _starpu_src_common_cat_3(char *final, const char *first, const char *second,
+										  const char *third)
+{
+	/* strcpy and strcat both return their destination */
+	strcat(strcat(strcpy(final, first), second), third);
+}
+
+/* Store the concatenation of FIRST and SECOND in FINAL, which must be large
+ * enough to hold it (no bound is checked here). */
+static void _starpu_src_common_cat_2(char *final, const char *first, const char *second)
+{
+	strcpy(final, first);
+	strcat(final, second);
+}
+
+/* Store in FINAL the path made of directory DIR followed by FILE, with
+ * exactly one '/' between them: a leading '/' in FILE is dropped and one is
+ * added after DIR unless DIR already ends with it.
+ * An empty DIR yields "/" followed by FILE.
+ * FINAL must be large enough to hold the result (no bound is checked here).
+ */
+static void _starpu_src_common_dir_cat(char *final, const char *dir, const char *file)
+{
+	if (file[0] == '/')
+		++file;
+
+	size_t size = strlen(dir);
+	/* Check the length first: dir[size - 1] would read before the
+	 * beginning of an empty string. */
+	if (size > 0 && dir[size - 1] == '/')
+		_starpu_src_common_cat_2(final, dir, file);
+	else
+		_starpu_src_common_cat_3(final, dir, "/", file);
+}
+
+/* Try BASE followed by each entry of the NULL-terminated array SUFFIXES, in
+ * order, until one of the resulting paths is readable.
+ * Returns 0 and leaves that path in LOCATED_FILE_NAME on success; returns 1
+ * if no candidate is readable (LOCATED_FILE_NAME then holds the last
+ * candidate tried, if any).
+ */
+static int _starpu_src_common_test_suffixes(char *located_file_name, const char *base, const char **suffixes)
+{
+	const char **suffix = suffixes;
+
+	while (*suffix != NULL)
+	{
+		_starpu_src_common_cat_2(located_file_name, base, *suffix);
+		if (access(located_file_name, R_OK) == 0)
+			return 0;
+		++suffix;
+	}
+
+	return 1;
+}
+
+/* Locate the file to run on the device and store its path in
+ * LOCATED_FILE_NAME. Only the first non-NULL name among ENV_FILE_NAME,
+ * CONFIG_FILE_NAME and ACTUAL_FILE_NAME is considered:
+ *  - ENV_FILE_NAME or CONFIG_FILE_NAME: used as is if readable, otherwise
+ *    looked up inside ENV_MIC_PATH (when given) ;
+ *  - ACTUAL_FILE_NAME: tried with each of SUFFIXES appended, then, when
+ *    ENV_MIC_PATH is given, each trailing part of the path (the file name,
+ *    then the last directory plus the file name, and so on) is looked up
+ *    inside ENV_MIC_PATH, both as is and with each suffix.
+ * Returns 0 if a readable file was found, a non-zero value otherwise.
+ * Paths too long for the internal 1024-byte buffers are treated as not
+ * found instead of overflowing them.
+ */
+int _starpu_src_common_locate_file(char *located_file_name,
+							const char *env_file_name, const char *env_mic_path,
+							const char *config_file_name, const char *actual_file_name,
+							const char **suffixes)
+{
+	if (env_file_name != NULL)
+	{
+		if (access(env_file_name, R_OK) == 0)
+		{
+			strcpy(located_file_name, env_file_name);
+			return 0;
+		}
+		else if(env_mic_path != NULL)
+		{
+			_starpu_src_common_dir_cat(located_file_name, env_mic_path, env_file_name);
+
+			return access(located_file_name, R_OK);
+		}
+	}
+	else if (config_file_name != NULL)
+	{
+		if (access(config_file_name, R_OK) == 0)
+		{
+			strcpy(located_file_name, config_file_name);
+			return 0;
+		}
+		else if (env_mic_path != NULL)
+		{
+			_starpu_src_common_dir_cat(located_file_name, env_mic_path, config_file_name);
+
+			return access(located_file_name, R_OK);
+		}
+	}
+	else if (actual_file_name != NULL)
+	{
+		if (_starpu_src_common_test_suffixes(located_file_name, actual_file_name, suffixes) == 0)
+			return 0;
+
+		if (env_mic_path != NULL)
+		{
+			char actual_cpy[1024];
+			size_t dir_len = strlen(env_mic_path);
+
+			/* The copy below is not bounded by itself */
+			if (strlen(actual_file_name) >= sizeof(actual_cpy))
+				return 1;
+			strcpy(actual_cpy, actual_file_name);
+
+			char *last =  strrchr(actual_cpy, '/');
+			while (last != NULL)
+			{
+				char tmp[1024];
+
+				/* LAST only grows from one iteration to the
+				 * next: once directory + '/' + LAST + '\0' no
+				 * longer fits in TMP, no later candidate will. */
+				if (dir_len + strlen(last) + 2 > sizeof(tmp))
+					break;
+
+				_starpu_src_common_dir_cat(tmp, env_mic_path, last);
+
+				if (access(tmp, R_OK) == 0)
+				{
+					strcpy(located_file_name, tmp);
+					return 0;
+				}
+
+				if (_starpu_src_common_test_suffixes(located_file_name, tmp, suffixes) == 0)
+					return 0;
+
+				/* Move LAST to the previous '/': hide the
+				 * current one just for the search. */
+				*last = '\0';
+				char *last_tmp = strrchr(actual_cpy, '/');
+				*last = '/';
+				last = last_tmp;
+			}
+		}
+	}
+
+	return 1;
+}

+ 63 - 0
src/drivers/mp_common/source_common.h

@@ -0,0 +1,63 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#ifndef __SOURCE_COMMON_H__
+#define __SOURCE_COMMON_H__
+
+
+#ifdef STARPU_USE_MP
+
+#include <drivers/mp_common/mp_common.h>
+
+int _starpu_src_common_sink_nbcores (const struct _starpu_mp_node *node, int *buf);
+
+int _starpu_src_common_lookup(const struct _starpu_mp_node *node,
+			      void (**func_ptr)(void), const char *func_name);
+
+int _starpu_src_common_execute_kernel(const struct _starpu_mp_node *node,
+				      void (*kernel)(void), unsigned coreid,
+				      starpu_data_handle_t *handles, void **interfaces, unsigned nb_interfaces,
+				      void *cl_arg, size_t cl_arg_size);
+
+int _starpu_src_common_execute_kernel_from_task(const struct _starpu_mp_node *node,
+						void (*kernel)(void), unsigned coreid,
+						struct starpu_task *task);
+
+int _starpu_src_common_allocate(const struct _starpu_mp_node *mp_node,
+				void **addr, size_t size);
+
+void _starpu_src_common_free(const struct _starpu_mp_node *mp_node,
+			     void *addr);
+
+int _starpu_src_common_copy_host_to_sink(const struct _starpu_mp_node *mp_node,
+					 void *src, void *dst, size_t size);
+
+int _starpu_src_common_copy_sink_to_host(const struct _starpu_mp_node *mp_node,
+					 void *src, void *dst, size_t size);
+
+int _starpu_src_common_copy_sink_to_sink(const struct _starpu_mp_node *src_node,
+					 const struct _starpu_mp_node *dst_node, void *src, void *dst, size_t size);
+
+int _starpu_src_common_locate_file(char *located_file_name,
+				   const char *env_file_name, const char *env_mic_path,
+				   const char *config_file_name, const char *actual_file_name,
+				   const char **suffixes);
+
+#endif /* STARPU_USE_MP */
+
+
+#endif /* __SOURCE_COMMON_H__ */

+ 174 - 0
src/drivers/scc/driver_scc_common.c

@@ -0,0 +1,174 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <sys/mman.h>
+#include <fcntl.h>
+
+#include <core/workers.h>
+
+#include <drivers/mp_common/mp_common.h>
+#include <drivers/scc/driver_scc_common.h>
+
+#include <RCCE_lib.h>
+#include <SCC_API.h>
+
+static int rcce_initialized;
+
+static int src_node_id;
+
+static t_vcharp rckncm_map;
+static t_vcharp shm_addr;
+
+
+/* Choose which RCCE node acts as the source (master) node and record it in
+ * src_node_id. The STARPU_SCC_MASTER_NODE environment variable is honoured
+ * when it is below RCCE_num_ues(); otherwise node 0 is used.
+ */
+static void _starpu_scc_set_src_node_id()
+{
+	int requested = starpu_get_env_number("STARPU_SCC_MASTER_NODE");
+
+	/* Node 0 by default. */
+	int chosen = 0;
+
+	if (requested != -1)
+	{
+		if (requested < RCCE_num_ues())
+		{
+			chosen = requested;
+		}
+		else if (RCCE_ue() == 0)
+		{
+			/* Only node 0 prints the error message. */
+			fprintf(stderr, "The node you specify to be the master is "
+					"greater than the total number of nodes.\n"
+					"Taking node 0 (core %d) by default...\n", RC_COREID[0]);
+		}
+	}
+
+	src_node_id = chosen;
+}
+
+/* Try to initialize the RCCE API and map the SCC shared memory.
+ *
+ * The steps are: open "/dev/rckncm", mmap SHMSIZE bytes from the page
+ * containing SHM_ADDR, call RCCE_init() with the program's argc/argv, hand
+ * the shared memory to RCCE_shmalloc_init(), then choose the source node.
+ * The outcome is also recorded in rcce_initialized.
+ *
+ * return: 	1 on success
+ * 			0 on failure
+ */
+int _starpu_scc_common_mp_init()
+{
+	int rckncm_fd;
+
+	/* "/dev/rckncm" is to access shared memory on SCC. */
+	if ((rckncm_fd = open("/dev/rckncm", O_RDWR | O_SYNC)) < 0)
+	{
+		/* It seems that we're not on a SCC system. */
+		return (rcce_initialized = 0);
+	}
+
+	/* The mmap offset is SHM_ADDR rounded down to a page boundary. */
+	int page_size = getpagesize();
+	unsigned int aligne_addr = (SHM_ADDR) & (~(page_size - 1));
+	if ((rckncm_map = (t_vcharp)mmap(NULL, SHMSIZE, PROT_WRITE | PROT_READ, MAP_SHARED,
+					rckncm_fd, aligne_addr)) == MAP_FAILED)
+	{
+		perror("mmap");
+		close(rckncm_fd);
+		return (rcce_initialized = 0);
+	}
+
+	int *argc = _starpu_get_argc();
+	char ***argv = _starpu_get_argv();
+
+	/* We can't initialize RCCE without argc and argv. */
+	if (!argc || *argc <= 1 || !argv || (RCCE_init(argc, argv) != RCCE_SUCCESS))
+	{
+		/* Undo the open and the mapping done above. */
+		close(rckncm_fd);
+		munmap((void*)rckncm_map, SHMSIZE);
+		return (rcce_initialized = 0);
+	}
+
+	/* SHM_ADDR itself lies page_offset bytes into the mapping. */
+	unsigned int page_offset = (SHM_ADDR) - aligne_addr;
+	shm_addr = rckncm_map + page_offset;
+
+	RCCE_shmalloc_init(shm_addr, RCCE_SHM_SIZE_MAX);
+
+	/* Which core of the SCC will be the master one? */
+	_starpu_scc_set_src_node_id();
+
+	/* The descriptor is closed here; the mapping itself is only released
+	 * by _starpu_scc_common_unmap_shared_memory(). */
+	close(rckncm_fd);
+
+	return (rcce_initialized = 1);
+}
+
+/* Return shm_addr, the address computed by _starpu_scc_common_mp_init() as
+ * the start of the mapping plus the in-page offset of SHM_ADDR. */
+void *_starpu_scc_common_get_shared_memory_addr()
+{
+	return (void*)shm_addr;
+}
+
+/* Release the SHMSIZE-byte mapping created by _starpu_scc_common_mp_init(). */
+void _starpu_scc_common_unmap_shared_memory()
+{
+	munmap((void*)rckncm_map, SHMSIZE);
+}
+
+/* Tell whether PTR points into the SHMSIZE bytes starting at shm_addr.
+ * Returns non-zero if so, 0 otherwise.
+ *
+ * The comparison is done on character pointers: arithmetic on void * is a
+ * compiler extension, not standard C.
+ * NOTE(review): the mapping is SHMSIZE bytes long from rckncm_map, while
+ * this range starts at shm_addr (rckncm_map + page offset) -- confirm the
+ * upper bound is the intended one.
+ */
+int _starpu_scc_common_is_in_shared_memory(void *ptr)
+{
+	const char *start = (const char *)shm_addr;
+	const char *candidate = (const char *)ptr;
+
+	return start <= candidate && candidate < start + SHMSIZE;
+}
+
+/* Return the flag set by _starpu_scc_common_mp_init(): 1 if RCCE was
+ * successfully initialized, 0 otherwise. */
+int _starpu_scc_common_is_mp_initialized()
+{
+	return rcce_initialized;
+}
+
+/* Return the id of the node chosen as source by _starpu_scc_set_src_node_id(). */
+int _starpu_scc_common_get_src_node_id()
+{
+	return src_node_id;
+}
+
+/* Return non-zero when the calling node (RCCE_ue()) is the source node. */
+int _starpu_scc_common_is_src_node()
+{
+	return RCCE_ue() == src_node_id;
+}
+
+/* Send LEN bytes from MSG to the RCCE node attached to NODE
+ * (node->mp_connection.scc_nodeid). Any RCCE failure is reported through
+ * STARPU_MP_COMMON_REPORT_ERROR.
+ */
+void _starpu_scc_common_send(const struct _starpu_mp_node *node, void *msg, int len)
+{
+	int ret;
+
+	/* There are potentially 48 threads running on the master core and RCCE_send writes
+	 * data in the MPB associated to this core. It's not thread safe, so we have to protect it.
+	 * RCCE_acquire_lock uses a test&set register on SCC. */
+	RCCE_acquire_lock(RCCE_ue());
+	ret = RCCE_send(msg, len, node->mp_connection.scc_nodeid);
+	/* Release exactly once, whatever the outcome, before reporting: the lock
+	 * must never be released twice if the error report ever returns. */
+	RCCE_release_lock(RCCE_ue());
+
+	if (ret != RCCE_SUCCESS)
+		STARPU_MP_COMMON_REPORT_ERROR(node, ret);
+}
+
+/* Receive LEN bytes into MSG from the RCCE node attached to NODE
+ * (node->mp_connection.scc_nodeid). Any RCCE failure is reported through
+ * STARPU_MP_COMMON_REPORT_ERROR.
+ */
+void _starpu_scc_common_recv(const struct _starpu_mp_node *node, void *msg, int len)
+{
+	int status = RCCE_recv(msg, len, node->mp_connection.scc_nodeid);
+
+	if (status != RCCE_SUCCESS)
+	{
+		STARPU_MP_COMMON_REPORT_ERROR(node, status);
+	}
+}
+
+/* Print on stderr the RCCE message matching ERR_NO, together with the
+ * location (FUNC, FILE, LINE) where the error was detected, then abort.
+ */
+void _starpu_scc_common_report_rcce_error(const char *func, const char *file, const int line, const int err_no)
+{
+	int message_length;
+	char message[RCCE_MAX_ERROR_STRING];
+
+	RCCE_error_string(err_no, message, &message_length);
+	fprintf(stderr, "RCCE error in %s (%s:%d): %s\n", func, file, line, message);
+
+	STARPU_ABORT();
+}

+ 50 - 0
src/drivers/scc/driver_scc_common.h

@@ -0,0 +1,50 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DRIVER_SCC_COMMON_H__
+#define __DRIVER_SCC_COMMON_H__
+
+#include <common/config.h>
+
+
+#ifdef STARPU_USE_SCC
+
+#include <RCCE_lib.h>
+
+#include <drivers/mp_common/mp_common.h>
+
+/* Map a StarPU SCC device id to an RCCE node id: ids below the calling
+ * node's own RCCE id are kept, the others are shifted by one so that the
+ * calling node's id is skipped. The whole expansion is parenthesized so the
+ * macro can be used safely inside larger expressions. */
+#define STARPU_TO_SCC_SINK_ID(id) ((id) < RCCE_ue() ? (id) : ((id) + 1))
+
+int _starpu_scc_common_mp_init();
+
+void *_starpu_scc_common_get_shared_memory_addr();
+void _starpu_scc_common_unmap_shared_memory();
+int _starpu_scc_common_is_in_shared_memory(void *ptr);
+
+int _starpu_scc_common_is_mp_initialized();
+
+int _starpu_scc_common_get_src_node_id();
+int _starpu_scc_common_is_src_node();
+
+void _starpu_scc_common_send(const struct _starpu_mp_node *node, void *msg, int len);
+void _starpu_scc_common_recv(const struct _starpu_mp_node *node, void *msg, int len);
+
+void _starpu_scc_common_report_rcce_error(const char *func, const char *file, const int line, const int err_no);
+
+#endif /* STARPU_USE_SCC */
+
+
+#endif /* __DRIVER_SCC_COMMON_H__ */

+ 125 - 0
src/drivers/scc/driver_scc_sink.c

@@ -0,0 +1,125 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <RCCE.h>
+
+#include <drivers/mp_common/sink_common.h>
+#include <drivers/scc/driver_scc_common.h>
+#include <drivers/scc/driver_scc_sink.h>
+
+
+
+/* Sink-side initialization of NODE: the peer of a sink is the source node,
+ * so its RCCE id is stored as the connection target. */
+void _starpu_scc_sink_init(struct _starpu_mp_node *node)
+{
+	node->mp_connection.scc_nodeid = _starpu_scc_common_get_src_node_id();
+}
+
+/* Sink-side teardown: unmap the shared memory and finalize RCCE.
+ * NODE is not used. */
+void _starpu_scc_sink_deinit(struct _starpu_mp_node *node)
+{
+	(void)node;
+
+	_starpu_scc_common_unmap_shared_memory();
+	RCCE_finalize();
+}
+
+/* Send LEN bytes from MSG to the sink whose StarPU device id is DST_DEVID
+ * (translated to an RCCE id with STARPU_TO_SCC_SINK_ID). Any RCCE failure is
+ * reported through STARPU_MP_COMMON_REPORT_ERROR.
+ */
+void _starpu_scc_sink_send_to_device(const struct _starpu_mp_node *node, int dst_devid, void *msg, int len)
+{
+	int status = RCCE_send(msg, len, STARPU_TO_SCC_SINK_ID(dst_devid));
+
+	if (status != RCCE_SUCCESS)
+	{
+		STARPU_MP_COMMON_REPORT_ERROR(node, status);
+	}
+}
+
+/* Receive LEN bytes into MSG from the sink whose StarPU device id is
+ * SRC_DEVID (translated to an RCCE id with STARPU_TO_SCC_SINK_ID). Any RCCE
+ * failure is reported through STARPU_MP_COMMON_REPORT_ERROR.
+ */
+void _starpu_scc_sink_recv_from_device(const struct _starpu_mp_node *node, int src_devid, void *msg, int len)
+{
+	int status = RCCE_recv(msg, len, STARPU_TO_SCC_SINK_ID(src_devid));
+
+	if (status != RCCE_SUCCESS)
+	{
+		STARPU_MP_COMMON_REPORT_ERROR(node, status);
+	}
+}
+
+/* arg -> [Function pointer on sink, number of interfaces, interfaces
+ * (union _starpu_interface), cl_arg]
+ *
+ * This function changes the dev_handle and the ptr of each interface
+ * given to the sink, then hands the (patched) ARG over to
+ * _starpu_sink_common_execute().
+ * dev_handle 	-> 	start of the shared memory (different for each sink)
+ * ptr 			-> 	dev_handle + offset
+ *
+ * Only matrix, block, vector and variable interfaces are handled; any other
+ * interface id aborts.
+ */
+void _starpu_scc_sink_execute(const struct _starpu_mp_node *node, void *arg, int arg_size)
+{
+	/* Byte cursor over ARG: a character pointer keeps the pointer
+	 * arithmetic below within standard C (void * arithmetic is an
+	 * extension). */
+	char *local_arg = arg;
+
+	/* point after the kernel */
+	local_arg += sizeof(void(*)(void**, void*));
+
+	unsigned nb_interfaces = *(unsigned*)local_arg;
+	local_arg += sizeof(nb_interfaces);
+
+	uintptr_t shm_addr = (uintptr_t)_starpu_scc_common_get_shared_memory_addr();
+
+	unsigned i;
+	for (i = 0; i < nb_interfaces; ++i)
+	{
+		/* The first field of an interface is the interface id. */
+		switch (*(enum starpu_data_interface_id *)local_arg)
+		{
+			case STARPU_MATRIX_INTERFACE_ID:
+			{
+				struct starpu_matrix_interface *matrix = (struct starpu_matrix_interface *)local_arg;
+				matrix->dev_handle = shm_addr;
+				matrix->ptr = matrix->dev_handle + matrix->offset;
+				break;
+			}
+
+			case STARPU_BLOCK_INTERFACE_ID:
+			{
+				struct starpu_block_interface *block = (struct starpu_block_interface *)local_arg;
+				block->dev_handle = shm_addr;
+				block->ptr = block->dev_handle + block->offset;
+				break;
+			}
+
+			case STARPU_VECTOR_INTERFACE_ID:
+			{
+				struct starpu_vector_interface *vector = (struct starpu_vector_interface *)local_arg;
+				vector->dev_handle = shm_addr;
+				vector->ptr = vector->dev_handle + vector->offset;
+				break;
+			}
+
+			case STARPU_VARIABLE_INTERFACE_ID:
+			{
+				struct starpu_variable_interface *variable = (struct starpu_variable_interface *)local_arg;
+				variable->dev_handle = shm_addr;
+				variable->ptr = variable->dev_handle + variable->offset;
+				break;
+			}
+
+			case STARPU_CSR_INTERFACE_ID:
+			case STARPU_BCSR_INTERFACE_ID:
+			case STARPU_MULTIFORMAT_INTERFACE_ID:
+				fprintf(stderr, "Data type not supported on SCC.\n");
+				/* fall through: known but unsupported types abort too */
+
+			default:
+				STARPU_ABORT();
+		}
+
+		/* point to the next interface */
+		local_arg += sizeof(union _starpu_interface);
+	}
+
+	_starpu_sink_common_execute(node, arg, arg_size);
+}

+ 38 - 0
src/drivers/scc/driver_scc_sink.h

@@ -0,0 +1,38 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DRIVER_SCC_SINK_H__
+#define __DRIVER_SCC_SINK_H__
+
+#include <common/config.h>
+
+
+#ifdef STARPU_USE_SCC
+
+#include <drivers/mp_common/mp_common.h>
+
+void _starpu_scc_sink_init(struct _starpu_mp_node *node);
+void _starpu_scc_sink_deinit(struct _starpu_mp_node *node);
+
+void _starpu_scc_sink_send_to_device(const struct _starpu_mp_node *node, int dst_devid, void *msg, int len);
+void _starpu_scc_sink_recv_from_device(const struct _starpu_mp_node *node, int src_devid, void *msg, int len);
+
+void _starpu_scc_sink_execute(const struct _starpu_mp_node *node, void *arg, int arg_size);
+
+#endif /* STARPU_USE_SCC */
+
+
+#endif /* __DRIVER_SCC_SINK_H__ */

+ 408 - 0
src/drivers/scc/driver_scc_source.c

@@ -0,0 +1,408 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <starpu.h>
+#include <starpu_profiling.h>
+#include <core/sched_policy.h>
+#include <core/task.h>
+
+#include <RCCE.h>
+
+#include <drivers/driver_common/driver_common.h>
+#include <drivers/mp_common/source_common.h>
+#include <drivers/scc/driver_scc_common.h>
+#include <drivers/scc/driver_scc_source.h>
+
+/* Per-device node structures, indexed by SCC device id; each entry is
+ * created by _starpu_scc_src_init_context(). */
+static struct _starpu_mp_node *scc_mp_nodes[STARPU_MAXSCCDEVS];
+
+/* A kernel registered by name, together with the function pointers already
+ * looked up for it on the SCC devices. */
+struct _starpu_scc_kernel
+{
+	/* Heap-allocated copy of the function name given at registration. */
+	char *name;
+	/* Per-device function pointer, indexed by device id; NULL until
+	 * _starpu_scc_src_get_kernel() has looked it up. */
+	starpu_scc_kernel_t func[STARPU_MAXSCCDEVS];
+};
+
+/* Table of registered kernels, keyed by function name (terminating NUL
+ * included), and the mutex taken around its searches and insertions. */
+static struct _starpu_htbl kernels_htbl;
+starpu_pthread_mutex_t htbl_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
+
+/* Return the node structure of the SCC device owning MEMORY_NODE.
+ * The device id is checked against both bounds of scc_mp_nodes: it is a
+ * signed value, so a negative id must be rejected as well.
+ */
+static struct _starpu_mp_node *_starpu_scc_src_memory_node_to_mp_node(unsigned memory_node)
+{
+	int devid = _starpu_memory_node_to_devid(memory_node);
+
+	STARPU_ASSERT(devid >= 0 && devid < STARPU_MAXSCCDEVS);
+	return scc_mp_nodes[devid];
+}
+
+/* Create the source-side node structure of device DEVID and record it in
+ * scc_mp_nodes. */
+static void _starpu_scc_src_init_context(int devid)
+{
+	/* Let's create the node structure, we'll communicate with the peer
+	 * through RCCE thanks to it */
+	scc_mp_nodes[devid] = _starpu_mp_common_node_create(STARPU_SCC_SOURCE, devid);
+}
+
+/* Send the STARPU_EXIT command to device DEVID, then destroy its node
+ * structure. */
+static void _starpu_scc_src_deinit_context(int devid)
+{
+	_starpu_mp_common_send_command(scc_mp_nodes[devid], STARPU_EXIT, NULL, 0);
+
+	_starpu_mp_common_node_destroy(scc_mp_nodes[devid]);
+}
+
+/* Run job J on the SCC device driven by worker ARGS.
+ *
+ * The input data is fetched, the device-side kernel is resolved (either from
+ * the codelet's SCC function or, failing that, from its CPU function name),
+ * the kernel is executed remotely and the output is pushed back.
+ *
+ * Returns 0 on success, -EAGAIN when the input data could not be fetched.
+ */
+static int _starpu_scc_src_execute_job(struct _starpu_job *j, struct _starpu_worker *args)
+{
+	int ret;
+	uint32_t mask = 0;
+
+	STARPU_ASSERT(j);
+	struct starpu_task *task = j->task;
+
+	struct timespec codelet_start, codelet_end;
+
+	int profiling = starpu_profiling_status_get();
+
+	STARPU_ASSERT(task);
+	struct starpu_codelet *cl = task->cl;
+	STARPU_ASSERT(cl);
+
+	ret = _starpu_fetch_task_input(j, mask);
+	if (ret != 0)
+	{
+		/* there was not enough memory, so the input of
+		 * the codelet cannot be fetched ... put the
+		 * codelet back, and try it later */
+		return -EAGAIN;
+	}
+
+
+	starpu_scc_kernel_t kernel = NULL;
+
+	starpu_scc_func_t func = _starpu_task_get_scc_nth_implementation(cl, j->nimpl);
+	if (func)
+	{
+		/* We execute the function contained in the codelet, it must return a
+		 * pointer to the function to execute on the device, either specified
+		 * directly by the user or by a call to starpu_scc_get_kernel().
+		 */
+		kernel = func();
+	}
+	else
+	{
+		/* If user doesn't define any starpu_scc_func_t in cl->scc_funcs we try to use
+		 * cpu_funcs_name.
+		 */
+		char *func_name = _starpu_task_get_cpu_name_nth_implementation(cl, j->nimpl);
+		if (func_name)
+		{
+			starpu_scc_func_symbol_t symbol;
+
+			/* SYMBOL is only set when the registration succeeds, so
+			 * it must not be used otherwise. */
+			if (_starpu_scc_src_register_kernel(&symbol, func_name) == 0)
+				kernel = _starpu_scc_src_get_kernel(symbol);
+		}
+	}
+	STARPU_ASSERT(kernel);
+
+	_starpu_driver_start_job(args, j, &codelet_start, 0, profiling);
+
+	/* The prototype in source_common.h takes a core id between the kernel
+	 * and the task.
+	 * NOTE(review): 0 is passed here -- confirm this is the core id the
+	 * SCC sinks expect. */
+	_starpu_src_common_execute_kernel_from_task(scc_mp_nodes[args->devid], (void (*)(void)) kernel, 0, task);
+
+	_starpu_driver_end_job(args, j, args->perf_arch, &codelet_end, 0, profiling);
+
+	_starpu_driver_update_job_feedback(j, args, args->perf_arch, &codelet_start, &codelet_end, profiling);
+
+	_starpu_push_task_output(j, mask);
+
+
+	return 0;
+}
+
+/* Source-side teardown: unmap the shared memory and finalize RCCE. */
+void _starpu_scc_src_mp_deinit()
+{
+	_starpu_scc_common_unmap_shared_memory();
+	RCCE_finalize();
+}
+
+/* Register the kernel named FUNC_NAME and return its handle through SYMBOL.
+ *
+ * If the name is already registered, the existing handle is returned.
+ * Otherwise a new entry is allocated, completely initialized (name copied,
+ * every per-device function pointer set to NULL) and only then inserted in
+ * the table, so that another thread finding it through the table never sees
+ * a partially initialized entry.
+ *
+ * Returns 0 on success, -ENOMEM when an allocation or the insertion fails
+ * (SYMBOL is left untouched in that case).
+ */
+int _starpu_scc_src_register_kernel(starpu_scc_func_symbol_t *symbol, const char *func_name)
+{
+	unsigned int func_name_size = (strlen(func_name) + 1) * sizeof(char);
+	unsigned int i;
+	int ret;
+
+	STARPU_PTHREAD_MUTEX_LOCK(&htbl_mutex);
+	struct _starpu_scc_kernel *kernel = _starpu_htbl_search(&kernels_htbl, func_name, func_name_size);
+
+	if (kernel != NULL)
+	{
+		STARPU_PTHREAD_MUTEX_UNLOCK(&htbl_mutex);
+		// Function already in the table.
+		*symbol = kernel;
+		return 0;
+	}
+
+	kernel = malloc(sizeof(*kernel));
+	if (kernel == NULL)
+	{
+		STARPU_PTHREAD_MUTEX_UNLOCK(&htbl_mutex);
+		return -ENOMEM;
+	}
+
+	kernel->name = malloc(func_name_size);
+	if (kernel->name == NULL)
+	{
+		STARPU_PTHREAD_MUTEX_UNLOCK(&htbl_mutex);
+		free(kernel);
+		return -ENOMEM;
+	}
+
+	/* Initialize the whole entry before it becomes reachable. */
+	memcpy(kernel->name, func_name, func_name_size);
+	for (i = 0; i < STARPU_MAXSCCDEVS; ++i)
+		kernel->func[i] = NULL;
+
+	ret = _starpu_htbl_insert(&kernels_htbl, func_name, func_name_size, kernel);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&htbl_mutex);
+	if (ret != 0)
+	{
+		free(kernel->name);
+		free(kernel);
+		return -ENOMEM;
+	}
+
+	*symbol = kernel;
+
+	return 0;
+}
+
+/* Return the device-side function pointer of the kernel designated by
+ * SYMBOL, for the device of the calling worker. The pointer is looked up on
+ * the device the first time and kept in the kernel entry afterwards.
+ * Returns NULL when not called from a worker or when the lookup fails.
+ */
+starpu_scc_kernel_t _starpu_scc_src_get_kernel(starpu_scc_func_symbol_t symbol)
+{
+	struct _starpu_scc_kernel *entry = symbol;
+	int worker = starpu_worker_get_id();
+	int dev;
+
+	/* This function has to be called in the codelet only, by the thread
+	 * which will handle the task */
+	if (worker < 0)
+		return NULL;
+
+	dev = starpu_worker_get_devid(worker);
+
+	/* Already resolved for this device. */
+	if (entry->func[dev] != NULL)
+		return entry->func[dev];
+
+	if (_starpu_src_common_lookup(scc_mp_nodes[dev], (void (**)(void))&entry->func[dev], entry->name) != 0)
+		return NULL;
+
+	return entry->func[dev];
+}
+
+/* Return the number of SCC sink devices: every RCCE node but one (the
+ * source), or 0 when RCCE is not initialized. */
+unsigned _starpu_scc_src_get_device_count()
+{
+	int sinks;
+
+	if (!_starpu_scc_common_is_mp_initialized())
+		return 0;
+
+	sinks = RCCE_num_ues() - 1;
+	if (sinks < 0)
+		sinks = 0;
+
+	return sinks;
+}
+
+/* Tell device DEVID to exit: a temporary node structure is created just to
+ * send it the STARPU_EXIT command, and destroyed right after. */
+void _starpu_scc_exit_useless_node(int devid)
+{
+	struct _starpu_mp_node *node = _starpu_mp_common_node_create(STARPU_SCC_SOURCE, devid);
+
+	_starpu_mp_common_send_command(node, STARPU_EXIT, NULL, 0);
+
+	_starpu_mp_common_node_destroy(node);
+}
+
+/* Source-side initialization of NODE: store the RCCE id of the peer sink,
+ * obtained from the peer's StarPU id with STARPU_TO_SCC_SINK_ID. */
+void _starpu_scc_src_init(struct _starpu_mp_node *node)
+{
+	node->mp_connection.scc_nodeid = STARPU_TO_SCC_SINK_ID(node->peer_id);
+}
+
+/* Allocate SIZE bytes on the SCC device owning MEMORY_NODE and store the
+ * resulting address in ADDR. The request is forwarded to
+ * _starpu_src_common_allocate(), whose result is returned as is.
+ * Return 0 if OK or 1 if not.
+ */
+int _starpu_scc_allocate_memory(void **addr, size_t size, unsigned memory_node)
+{
+	return _starpu_src_common_allocate(_starpu_scc_src_memory_node_to_mp_node(memory_node),
+			addr, size);
+}
+
+/* Free the memory at ADDR on the SCC device owning MEMORY_NODE.
+ */
+void _starpu_scc_free_memory(void *addr, unsigned memory_node)
+{
+	/* No "return <expr>" here: this function returns void. */
+	_starpu_src_common_free(_starpu_scc_src_memory_node_to_mp_node(memory_node),
+			addr);
+}
+
+/* Allocate SIZE bytes with RCCE_shmalloc() and store the address in ADDR.
+ * Returns 0 on success, 1 when the allocation returned NULL. */
+int _starpu_scc_allocate_shared_memory(void **addr, size_t size)
+{
+	*addr = (void*)RCCE_shmalloc(size);
+
+	return *addr == NULL;
+}
+
+/* Give ADDR back to the RCCE shared memory allocator (RCCE_shfree). */
+void _starpu_scc_free_shared_memory(void *addr)
+{
+	RCCE_shfree(addr);
+}
+
+/* Store in OFFSET the distance between PTR and the start of the shared
+ * memory, and in DEV_HANDLE the start of the shared memory itself (useful
+ * for data partitioning). Either output pointer may be NULL.
+ *
+ * Nothing is done when SCC tasks cannot be submitted. The program aborts
+ * when PTR does not point into the shared memory.
+ */
+void _starpu_scc_set_offset_in_shared_memory(void *ptr, void **dev_handle, size_t *offset)
+{
+	/* We're on SCC... */
+	if (_starpu_can_submit_scc_task())
+	{
+		if (!_starpu_scc_common_is_in_shared_memory(ptr))
+		{
+			fprintf(stderr, "The data (%p) you want to register does not seem to be allocated in shared memory. "
+					"Please use starpu_malloc to do this.\n", ptr);
+			STARPU_ABORT();
+		}
+
+		void *shm_addr = _starpu_scc_common_get_shared_memory_addr();
+
+		if (dev_handle)
+			*dev_handle = shm_addr;
+
+		/* Subtract as character pointers: void * arithmetic is a
+		 * compiler extension, not standard C. */
+		if (offset)
+			*offset = (char *)ptr - (char *)shm_addr;
+	}
+}
+
+/* Transfer SIZE bytes from the address pointed to by SRC on the source to
+ * the address pointed to by DST in the DST_NODE memory node (a sink), through
+ * _starpu_src_common_copy_host_to_sink(). SRC_NODE is unused.
+ */
+int _starpu_scc_copy_src_to_sink(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size)
+{
+	return _starpu_src_common_copy_host_to_sink(_starpu_scc_src_memory_node_to_mp_node(dst_node),
+			src, dst, size);
+}
+
+/* Transfer SIZE bytes from the address pointed to by SRC in the SRC_NODE
+ * memory node (a sink) to the address pointed to by DST on the source,
+ * through _starpu_src_common_copy_sink_to_host(). DST_NODE is unused.
+ */
+int _starpu_scc_copy_sink_to_src(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size)
+{
+	return _starpu_src_common_copy_sink_to_host(_starpu_scc_src_memory_node_to_mp_node(src_node),
+			src, dst, size);
+}
+
+/* Transfer SIZE bytes from the address pointed to by SRC in the SRC_NODE
+ * memory node to the address pointed to by DST in the DST_NODE memory node,
+ * both being sinks, through _starpu_src_common_copy_sink_to_sink().
+ */
+int _starpu_scc_copy_sink_to_sink(void *src, unsigned src_node, void *dst, unsigned dst_node, size_t size)
+{
+	return _starpu_src_common_copy_sink_to_sink(_starpu_scc_src_memory_node_to_mp_node(src_node),
+			_starpu_scc_src_memory_node_to_mp_node(dst_node),
+			src, dst, size);
+}
+
+/* Main function of the worker thread driving one SCC device.
+ * ARG is the struct _starpu_worker of this worker.
+ *
+ * After initialization, the worker loops while the machine is running:
+ * it makes the data requests of its memory node progress, picks a task,
+ * executes it on the device and handles its termination. On exit it
+ * releases the remaining buffers and tells the device to stop.
+ * Always returns NULL.
+ */
+void *_starpu_scc_src_worker(void *arg)
+{
+	struct _starpu_worker *args = arg;
+
+	int devid = args->devid;
+	int workerid = args->workerid;
+	unsigned memnode = args->memory_node;
+
+	_starpu_worker_init(args, _STARPU_FUT_SCC_KEY);
+
+	_starpu_scc_src_init_context(devid);
+
+	args->status = STATUS_UNKNOWN;
+
+	_STARPU_TRACE_WORKER_INIT_END
+
+	/* tell the main thread that this one is ready */
+	/* Same locking macros as the rest of this file
+	 * (STARPU_PTHREAD_MUTEX_LOCK / STARPU_PTHREAD_MUTEX_UNLOCK). */
+	STARPU_PTHREAD_MUTEX_LOCK(&args->mutex);
+	args->worker_is_initialized = 1;
+	_STARPU_PTHREAD_COND_SIGNAL(&args->ready_cond);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&args->mutex);
+
+	struct _starpu_job * j;
+	struct starpu_task *task;
+	int res;
+
+	while (_starpu_machine_is_running())
+	{
+		_STARPU_TRACE_START_PROGRESS(memnode);
+		_starpu_datawizard_progress(memnode, 1);
+		_STARPU_TRACE_END_PROGRESS(memnode);
+
+		task = _starpu_get_worker_task(args, workerid, memnode);
+		if (!task)
+			continue;
+
+		j = _starpu_get_job_associated_to_task(task);
+
+		/* can a SCC device do that task ? */
+		if (!_STARPU_SCC_MAY_PERFORM(j))
+		{
+			/* this isn't a SCC task */
+			_starpu_push_task_to_workers(j);
+			continue;
+		}
+
+		_starpu_set_current_task(task);
+		args->current_task = j->task;
+
+		res = _starpu_scc_src_execute_job(j, args);
+
+		_starpu_set_current_task(NULL);
+		args->current_task = NULL;
+
+		if (res)
+		{
+			switch (res)
+			{
+				case -EAGAIN:
+					_STARPU_DISP("ouch, put the codelet %p back ... \n", j);
+					_starpu_push_task(j);
+					STARPU_ABORT();
+					continue;
+				default:
+					STARPU_ASSERT(0);
+			}
+		}
+
+		_starpu_handle_job_termination(j);
+	}
+
+	_STARPU_TRACE_WORKER_DEINIT_START
+
+	_starpu_handle_all_pending_node_data_requests(memnode);
+
+	/* In case there remains some memory that was automatically
+	 * allocated by StarPU, we release it now. Note that data
+	 * coherency is not maintained anymore at that point ! */
+	_starpu_free_all_automatically_allocated_buffers(memnode);
+
+	_starpu_scc_src_deinit_context(args->devid);
+
+	_STARPU_TRACE_WORKER_DEINIT_END(_STARPU_FUT_SCC_KEY);
+
+	return NULL;
+}

+ 56 - 0
src/drivers/scc/driver_scc_source.h

@@ -0,0 +1,56 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __DRIVER_SCC_SOURCE_H__
+#define __DRIVER_SCC_SOURCE_H__
+
+#include <starpu.h>
+#include <starpu_scc.h>
+#include <common/config.h>
+
+
+#ifdef STARPU_USE_SCC
+
+#include <drivers/mp_common/mp_common.h>
+
+
+void _starpu_scc_src_mp_deinit();
+
+int _starpu_scc_src_register_kernel(starpu_scc_func_symbol_t *symbol, const char *func_name);
+starpu_scc_kernel_t _starpu_scc_src_get_kernel(starpu_scc_func_symbol_t symbol);
+
+unsigned _starpu_scc_src_get_device_count();
+void _starpu_scc_exit_useless_node(int devid);
+
+void _starpu_scc_src_init(struct _starpu_mp_node *node);
+
+int _starpu_scc_allocate_memory(void **addr, size_t size, unsigned memory_node);
+void _starpu_scc_free_memory(void *addr, unsigned memory_node);
+int _starpu_scc_allocate_shared_memory(void **addr, size_t size);
+void _starpu_scc_free_shared_memory(void *addr);
+
+void _starpu_scc_set_offset_in_shared_memory(void *ptr, void **dev_handle, size_t *offset);
+
+int _starpu_scc_copy_src_to_sink(void *src, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst, unsigned dst_node, size_t size);
+int _starpu_scc_copy_sink_to_src(void *src, unsigned src_node, void *dst, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, size_t size);
+int _starpu_scc_copy_sink_to_sink(void *src, unsigned src_node, void *dst, unsigned dst_node, size_t size);
+
+void *_starpu_scc_src_worker(void *arg);
+
+#endif /* STARPU_USE_SCC */
+
+
+#endif /* __DRIVER_SCC_SOURCE_H__ */

+ 45 - 0
src/drivers/scc/driver_scc_utils.c

@@ -0,0 +1,45 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <starpu.h>
+#include <starpu_scc.h>
+#include <common/config.h>
+
+#include <drivers/mp_common/source_common.h>
+#include <drivers/scc/driver_scc_source.h>
+
+
+/* Register the kernel named FUNC_NAME so that its address can later be
+ * looked up on the SCC devices, and return a handle on it through SYMBOL.
+ * This is a thin wrapper around _starpu_scc_src_register_kernel().
+ * On success, returns 0. Returns -ENOMEM when the registration could not
+ * allocate or store the kernel entry.
+ */
+int starpu_scc_register_kernel(starpu_scc_func_symbol_t *symbol,
+			       const char *func_name)
+{
+	return _starpu_scc_src_register_kernel(symbol, func_name);
+}
+
+/* On success, return the pointer to the function designated by SYMBOL on
+ * the device linked to the calling worker. This is a thin wrapper around
+ * _starpu_scc_src_get_kernel().
+ */
+starpu_scc_kernel_t starpu_scc_get_kernel(starpu_scc_func_symbol_t symbol)
+{
+	return _starpu_scc_src_get_kernel(symbol);
+}

+ 6 - 0
src/top/starpu_top.c

@@ -105,6 +105,12 @@ static void starpu_top_get_device_type(int id, char* type)
 	case STARPU_ANY_WORKER:
 	case STARPU_ANY_WORKER:
 		strncpy(type, "ANY",9);
 		strncpy(type, "ANY",9);
 		break;
 		break;
+	case STARPU_MIC_WORKER:
+		strncpy(type, "MIC", 9);
+		break;
+	case STARPU_SCC_WORKER:
+		strncpy(type, "SCC", 9);
+		break;
 	}
 	}
 }
 }
 
 

+ 3 - 0
src/util/execute_on_all.c

@@ -110,6 +110,9 @@ void starpu_execute_on_each_worker_ex(void (*func)(void *), void *arg, uint32_t
 	unsigned nworkers = starpu_worker_get_count();
 	unsigned nworkers = starpu_worker_get_count();
 	struct starpu_task *tasks[STARPU_NMAXWORKERS];
 	struct starpu_task *tasks[STARPU_NMAXWORKERS];
 
 
+	/* This method only work on CPU, CUDA, OPENCL */
+	STARPU_ASSERT((where & ~STARPU_CPU & ~STARPU_CUDA & ~STARPU_OPENCL) == 0);
+
 	/* create a wrapper codelet */
 	/* create a wrapper codelet */
 	struct starpu_codelet wrapper_cl =
 	struct starpu_codelet wrapper_cl =
 	{
 	{

+ 61 - 8
src/util/starpu_data_cpy.c

@@ -19,10 +19,15 @@
 #include <core/task.h>
 #include <core/task.h>
 #include <datawizard/datawizard.h>
 #include <datawizard/datawizard.h>
 #include <util/starpu_data_cpy.h>
 #include <util/starpu_data_cpy.h>
+#include <starpu_mic.h>
+#include <starpu_scc.h>
 
 
-static void data_cpy_func(void *descr[], void *cl_arg)
+static void common_data_cpy_func(void *descr[], void *cl_arg)
 {
 {
-	const struct starpu_data_copy_methods *copy_methods = (const struct starpu_data_copy_methods *) cl_arg;
+	unsigned interface_id = *(unsigned *)cl_arg;
+
+	const struct starpu_data_interface_ops *interface_ops = _starpu_data_interface_get_ops(interface_id);
+	const struct starpu_data_copy_methods *copy_methods = interface_ops->copy_methods;
 
 
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
 	enum starpu_worker_archtype type = starpu_worker_get_type(workerid);
 	enum starpu_worker_archtype type = starpu_worker_get_type(workerid);
@@ -63,6 +68,48 @@ static void data_cpy_func(void *descr[], void *cl_arg)
 
 
 }
 }
 
 
+/* Copy kernel registered under the name "mp_cpy_kernel" for MIC and SCC.
+ * CL_ARG points to the id of the data interface; the interface's
+ * ram_to_ram copy method is used to copy descr[1] (source) into descr[0]
+ * (destination).
+ */
+void mp_cpy_kernel(void *descr[], void *cl_arg)
+{
+	unsigned id = *(unsigned *)cl_arg;
+	const struct starpu_data_copy_methods *methods = _starpu_data_interface_get_ops(id)->copy_methods;
+
+	STARPU_ASSERT(methods->ram_to_ram);
+
+	/* descr[0] is the destination, descr[1] the source. */
+	methods->ram_to_ram(descr[1], 0, descr[0], 0);
+}
+
+/* MIC entry of the copy codelet: return the device-side pointer of
+ * "mp_cpy_kernel". The kernel is registered on the first call and the
+ * symbol kept in a static variable. Aborts when StarPU is built without
+ * MIC support.
+ */
+static starpu_mic_kernel_t mic_cpy_func()
+{
+#ifdef STARPU_USE_MIC
+	static starpu_mic_func_symbol_t mic_symbol = NULL;
+	if (mic_symbol == NULL)
+		starpu_mic_register_kernel(&mic_symbol, "mp_cpy_kernel");
+
+	return starpu_mic_get_kernel(mic_symbol);
+#else
+	STARPU_ABORT();
+	return NULL;
+#endif
+}
+
+/* SCC entry of the copy codelet: return the device-side pointer of
+ * "mp_cpy_kernel". The kernel is registered on the first call and the
+ * symbol kept in a static variable. Aborts when StarPU is built without
+ * SCC support.
+ */
+static starpu_scc_kernel_t scc_cpy_func()
+{
+#ifdef STARPU_USE_SCC
+	static starpu_scc_func_symbol_t scc_symbol = NULL;
+	if (scc_symbol == NULL)
+		starpu_scc_register_kernel(&scc_symbol, "mp_cpy_kernel");
+
+	return starpu_scc_get_kernel(scc_symbol);
+#else
+	STARPU_ABORT();
+	return NULL;
+#endif
+}
+
 struct starpu_perfmodel copy_model =
 struct starpu_perfmodel copy_model =
 {
 {
 	.type = STARPU_HISTORY_BASED,
 	.type = STARPU_HISTORY_BASED,
@@ -71,10 +118,12 @@ struct starpu_perfmodel copy_model =
 
 
 static struct starpu_codelet copy_cl =
 static struct starpu_codelet copy_cl =
 {
 {
-	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL,
-	.cpu_funcs = {data_cpy_func, NULL},
-	.cuda_funcs = {data_cpy_func, NULL},
-	.opencl_funcs = {data_cpy_func, NULL},
+	.where = STARPU_CPU|STARPU_CUDA|STARPU_OPENCL|STARPU_MIC|STARPU_SCC,
+	.cpu_funcs = {common_data_cpy_func, NULL},
+	.cuda_funcs = {common_data_cpy_func, NULL},
+	.opencl_funcs = {common_data_cpy_func, NULL},
+	.mic_funcs = {mic_cpy_func, NULL},
+	.scc_funcs = {scc_cpy_func, NULL},
 	.nbuffers = 2,
 	.nbuffers = 2,
 	.modes = {STARPU_W, STARPU_R},
 	.modes = {STARPU_W, STARPU_R},
 	.model = &copy_model
 	.model = &copy_model
@@ -84,7 +133,6 @@ int _starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_h
 		     int asynchronous, void (*callback_func)(void*), void *callback_arg,
 		     int asynchronous, void (*callback_func)(void*), void *callback_arg,
 		     int reduction, struct starpu_task *reduction_dep_task)
 		     int reduction, struct starpu_task *reduction_dep_task)
 {
 {
-	const struct starpu_data_copy_methods *copy_methods = dst_handle->ops->copy_methods;
 
 
 	struct starpu_task *task = starpu_task_create();
 	struct starpu_task *task = starpu_task_create();
 	STARPU_ASSERT(task);
 	STARPU_ASSERT(task);
@@ -98,7 +146,12 @@ int _starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_h
 	}
 	}
 
 
 	task->cl = &copy_cl;
 	task->cl = &copy_cl;
-	task->cl_arg = (void *)copy_methods;
+
+	/* Pass the interface id through a heap buffer of its own rather than
+	 * a pointer to the copy methods.
+	 * NOTE(review): cl_arg_free = 1 presumably makes the runtime free the
+	 * buffer once the task is done; confirm. */
+	unsigned *interface_id = malloc(sizeof(*interface_id));
+	/* Do not dereference a failed allocation. */
+	STARPU_ASSERT(interface_id);
+	*interface_id = dst_handle->ops->interfaceid;
+	task->cl_arg = interface_id;
+	task->cl_arg_size = sizeof(*interface_id);
+	task->cl_arg_free = 1;
 
 
 	task->callback_func = callback_func;
 	task->callback_func = callback_func;
 	task->callback_arg = callback_arg;
 	task->callback_arg = callback_arg;

+ 1 - 1
src/util/starpu_insert_task.c

@@ -60,7 +60,6 @@ void starpu_codelet_unpack_args(void *_cl_arg, ...)
 	}
 	}
 
 
 	va_end(varg_list);
 	va_end(varg_list);
-	free(cl_arg);
 }
 }
 
 
 int starpu_insert_task(struct starpu_codelet *cl, ...)
 int starpu_insert_task(struct starpu_codelet *cl, ...)
@@ -80,6 +79,7 @@ int starpu_insert_task(struct starpu_codelet *cl, ...)
 	}
 	}
 
 
 	struct starpu_task *task = starpu_task_create();
 	struct starpu_task *task = starpu_task_create();
+	task->cl_arg_free = 1;
 
 
 	if (cl && cl->nbuffers > STARPU_NMAXBUFS)
 	if (cl && cl->nbuffers > STARPU_NMAXBUFS)
 	{
 	{

+ 35 - 0
starpu-1.0-mic.pc.in

@@ -0,0 +1,35 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009, 2010, 2011, 2013  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+pkglibdir=@pkglibdir@
+includedir=@includedir@
+
+# When the GCC plug-in is available, the following lines indicate
+# where it is installed.
+@GCC_PLUGIN_DIR_PKGCONFIG@
+@GCC_PLUGIN_PKGCONFIG@
+
+Name: starpu
+Description: offers support for heterogeneous multicore architecture
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@
+Libs: @STARPU_EXPORT_DYNAMIC@ -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@
+Libs.private: @LDFLAGS@ @LIBS@
+Requires: @HWLOC_REQUIRES@
+Requires.private: @GORDON_REQUIRES@

+ 1 - 1
starpu-1.0.pc.in

@@ -29,6 +29,6 @@ Name: starpu
 Description: offers support for heterogeneous multicore architecture
 Description: offers support for heterogeneous multicore architecture
 Version: @PACKAGE_VERSION@
 Version: @PACKAGE_VERSION@
 Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ -DSTARPU_USE_DEPRECATED_ONE_ZERO_API
 Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ -DSTARPU_USE_DEPRECATED_ONE_ZERO_API
-Libs: -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_OPENCL_LDFLAGS@ @STARPU_CUDA_LDFLAGS@ @STARPU_SC_HYPERVISOR@
+Libs: @STARPU_EXPORT_DYNAMIC@ -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_OPENCL_LDFLAGS@ @STARPU_CUDA_LDFLAGS@ @STARPU_SC_HYPERVISOR@
 Libs.private: @LDFLAGS@ @LIBS@ @LIBSTARPU_LDFLAGS@
 Libs.private: @LDFLAGS@ @LIBS@ @LIBSTARPU_LDFLAGS@
 Requires: @HWLOC_REQUIRES@
 Requires: @HWLOC_REQUIRES@

+ 1 - 1
starpu-1.1.pc.in

@@ -29,6 +29,6 @@ Name: starpu
 Description: offers support for heterogeneous multicore architecture
 Description: offers support for heterogeneous multicore architecture
 Version: @PACKAGE_VERSION@
 Version: @PACKAGE_VERSION@
 Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@
 Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@
-Libs: -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_OPENCL_LDFLAGS@ @STARPU_CUDA_LDFLAGS@ @STARPU_SC_HYPERVISOR@
+Libs: @STARPU_EXPORT_DYNAMIC@ -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_OPENCL_LDFLAGS@ @STARPU_CUDA_LDFLAGS@ @STARPU_SC_HYPERVISOR@
 Libs.private: @LDFLAGS@ @LIBS@ @LIBSTARPU_LDFLAGS@
 Libs.private: @LDFLAGS@ @LIBS@ @LIBSTARPU_LDFLAGS@
 Requires: @HWLOC_REQUIRES@
 Requires: @HWLOC_REQUIRES@

+ 74 - 0
super-configure

@@ -0,0 +1,74 @@
+#!/bin/sh
+
+ROOT_DIR=$PWD
+MIC_HOST=x86_64-k1om-linux
+MIC_CC_PATH=/usr/linux-k1om-4.7/bin/
+DEFAULT_PREFIX=/usr/local
+
+export PATH=${MIC_CC_PATH}${PATH:+:${PATH}}
+
+echo "This file was created by StarPU super-configure 0.0.1." > ./super-config.log
+echo "" >> ./super-config.log
+echo " $ $0 $*" >> ./super-config.log
+
+for arch in mic host
+do
+
+	# We call the configure script from a build directory further in the
+	# arborescence
+	command="${ROOT_DIR}/configure --enable-mic --with-coi-dir=/opt/intel/mic/coi"
+	prefix_found=no
+
+	if test x$arch = xmic ; then
+		command="$command --without-hwloc --with-coi-lib-dir=/opt/intel/mic/coi/device-linux-release/lib --host=$MIC_HOST"
+	else
+		command="$command --with-coi-lib-dir=/opt/intel/mic/coi/host-linux-release/lib"
+	fi
+
+	for arg in $*
+	do
+		if [ ${arg:0:9} = '--prefix=' ]
+		then
+			prefix_found=yes
+			prefix="${arg:9}"
+			command="$command ${arg}/${arch}"
+		else
+			command="$command $arg"
+		fi
+
+	done
+
+	# If the user didn't specify a directory where to install the library
+	# we apply the default one
+	if test x$prefix_found = xno ; then
+		command="$command --prefix=${DEFAULT_PREFIX}/$arch"
+		prefix=${DEFAULT_PREFIX}
+	fi
+
+	# If the build directory doesn't exist yet, create it
+	if [ ! -d "${ROOT_DIR}/build_${arch}" ] ; then
+		mkdir "build_${arch}"
+	fi
+
+	cd "build_${arch}"
+
+	if test x$arch = xmic ; then
+		LDFLAGS=-export-dynamic $command
+	else
+		$command
+	fi
+	make -j
+
+	if test x$arch = xmic ; then
+		make check > /dev/null 2&>1
+	fi
+
+	make install
+	cd "${ROOT_DIR}"
+
+done
+
+if [ ! -f "${prefix}/mic/lib/pkgconfig/starpu-1.0-mic.pc" ]
+then
+	ln -s "${prefix}/mic/lib/pkgconfig/starpu-1.0.pc" "${prefix}/mic/lib/pkgconfig/starpu-1.0-mic.pc"
+fi

+ 1 - 0
tools/Makefile.am

@@ -19,6 +19,7 @@ SUBDIRS =
 AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS)
 AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS)
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/tools/ -I$(top_srcdir)/mpi/ -I$(top_builddir)/src -I$(top_srcdir)/src
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/tools/ -I$(top_srcdir)/mpi/ -I$(top_builddir)/src -I$(top_srcdir)/src
+AM_LDFLAGS = $(STARPU_COI_LDFLAGS)
 
 
 bin_PROGRAMS =
 bin_PROGRAMS =
 dist_bin_SCRIPTS =
 dist_bin_SCRIPTS =

+ 10 - 0
tools/starpu_machine_display.c

@@ -141,6 +141,11 @@ int main(int argc, char **argv)
 	unsigned ncuda = starpu_cuda_worker_get_count();
 	unsigned ncuda = starpu_cuda_worker_get_count();
 	unsigned nopencl = starpu_opencl_worker_get_count();
 	unsigned nopencl = starpu_opencl_worker_get_count();
 
 
+#ifdef STARPU_USE_MIC
+	unsigned nmicdevs = starpu_mic_device_get_count();
+	unsigned nmiccores = starpu_mic_worker_get_count();
+#endif
+
 	fprintf(stdout, "StarPU has found :\n");
 	fprintf(stdout, "StarPU has found :\n");
 
 
 	fprintf(stdout, "\t%u CPU cores\n", ncpu);
 	fprintf(stdout, "\t%u CPU cores\n", ncpu);
@@ -152,6 +157,11 @@ int main(int argc, char **argv)
 	fprintf(stdout, "\t%u OpenCL devices\n", nopencl);
 	fprintf(stdout, "\t%u OpenCL devices\n", nopencl);
 	display_worker_names(STARPU_OPENCL_WORKER);
 	display_worker_names(STARPU_OPENCL_WORKER);
 
 
+#ifdef STARPU_USE_MIC
+	/* Both counters are unsigned: print them with %u, like the other
+	 * device counts, rather than %d. */
+	fprintf(stdout, "\t%u MIC cores (from %u devices)\n", nmiccores, nmicdevs);
+	display_worker_names(STARPU_MIC_WORKER);
+#endif
+
 	display_all_combined_workers();
 	display_all_combined_workers();
 
 
 	fprintf(stdout, "\ntopology ...\n");
 	fprintf(stdout, "\ntopology ...\n");