Sfoglia il codice sorgente

merge trunk up to 9949

Simon Archipoff 12 anni fa
parent
commit
f01e362c0a
100 ha cambiato i file con 3349 aggiunte e 340 eliminazioni
  1. 2 0
      AUTHORS
  2. 4 0
      ChangeLog
  3. 3 0
      Makefile.am
  4. 298 1
      configure.ac
  5. 11 0
      doc/chapters/advanced-examples.texi
  6. 217 8
      doc/chapters/api.texi
  7. 3 0
      doc/chapters/basic-examples.texi
  8. 4 0
      doc/chapters/configuration.texi
  9. 55 0
      doc/chapters/mic-scc-support.texi
  10. 4 1
      doc/chapters/perf-optimization.texi
  11. 3 1
      doc/chapters/sc_hypervisor.texi
  12. 2 1
      doc/chapters/vector_scal_c.texi
  13. 9 0
      doc/starpu.texi
  14. 9 1
      examples/Makefile.am
  15. 3 2
      examples/basic_examples/mult.c
  16. 17 4
      examples/basic_examples/vector_scal.c
  17. 2 1
      examples/basic_examples/vector_scal_c.c
  18. 2 0
      examples/filters/fblock.c
  19. 1 0
      examples/filters/fmatrix.c
  20. 1 0
      examples/filters/fvector.c
  21. 2 1
      examples/filters/shadow.c
  22. 2 1
      examples/filters/shadow2d.c
  23. 2 1
      examples/filters/shadow3d.c
  24. 1 0
      examples/incrementer/incrementer.c
  25. 2 0
      examples/interface/complex_codelet.h
  26. 2 1
      examples/openmp/vector_scal_omp.c
  27. 2 1
      examples/pi/pi.c
  28. 7 3
      examples/pi/pi_redux.c
  29. 3 2
      examples/ppm_downscaler/yuv_downscaler.c
  30. 3 0
      examples/reductions/dot_product.c
  31. 4 1
      examples/reductions/minmax_reduction.c
  32. 1 0
      examples/spmd/vector_scal_spmd.c
  33. 1 1
      examples/stencil/Makefile.am
  34. 7 4
      examples/stencil/stencil-kernels.c
  35. 3 2
      examples/stencil/stencil-tasks.c
  36. 25 0
      include/starpu.h
  37. 5 0
      include/starpu_config.h.in
  38. 11 2
      include/starpu_data.h
  39. 39 2
      include/starpu_data_interfaces.h
  40. 35 0
      include/starpu_mic.h
  41. 10 4
      include/starpu_perfmodel.h
  42. 35 0
      include/starpu_scc.h
  43. 1 1
      include/starpu_sched_ctx.h
  44. 23 0
      include/starpu_sink.h
  45. 11 0
      include/starpu_task.h
  46. 12 12
      include/starpu_task_util.h
  47. 1 1
      include/starpu_util.h
  48. 41 6
      include/starpu_worker.h
  49. 28 0
      libstarpu-mic.pc.in
  50. 1 1
      libstarpu.pc.in
  51. 90 0
      mic-configure
  52. 47 3
      src/Makefile.am
  53. 2 0
      src/common/fxt.h
  54. 8 2
      src/common/utils.c
  55. 11 5
      src/common/utils.h
  56. 3 2
      src/core/dependencies/data_concurrency.c
  57. 100 130
      src/core/dependencies/implicit_data_deps.c
  58. 6 0
      src/core/jobs.h
  59. 54 5
      src/core/perfmodel/perfmodel_bus.c
  60. 45 16
      src/core/perfmodel/perfmodel_history.c
  61. 2 1
      src/core/sched_ctx.c
  62. 24 2
      src/core/sched_policy.c
  63. 36 0
      src/core/task.c
  64. 4 0
      src/core/task.h
  65. 423 13
      src/core/topology.c
  66. 1 1
      src/core/topology.h
  67. 261 10
      src/core/workers.c
  68. 26 1
      src/core/workers.h
  69. 8 0
      src/datawizard/coherency.c
  70. 10 7
      src/datawizard/coherency.h
  71. 128 0
      src/datawizard/copy_driver.c
  72. 15 0
      src/datawizard/copy_driver.h
  73. 4 1
      src/datawizard/data_request.h
  74. 5 5
      src/datawizard/filters.c
  75. 1 0
      src/datawizard/interfaces/bcsr_filters.c
  76. 4 2
      src/datawizard/interfaces/bcsr_interface.c
  77. 1 0
      src/datawizard/interfaces/block_filters.c
  78. 197 2
      src/datawizard/interfaces/block_interface.c
  79. 4 2
      src/datawizard/interfaces/coo_interface.c
  80. 1 0
      src/datawizard/interfaces/csr_filters.c
  81. 6 2
      src/datawizard/interfaces/csr_interface.c
  82. 54 6
      src/datawizard/interfaces/data_interface.c
  83. 26 1
      src/datawizard/interfaces/data_interface.h
  84. 2 0
      src/datawizard/interfaces/matrix_filters.c
  85. 172 0
      src/datawizard/interfaces/matrix_interface.c
  86. 118 5
      src/datawizard/interfaces/multiformat_interface.c
  87. 20 4
      src/datawizard/interfaces/variable_interface.c
  88. 4 0
      src/datawizard/interfaces/vector_filters.c
  89. 10 2
      src/datawizard/interfaces/vector_interface.c
  90. 2 2
      src/datawizard/interfaces/void_interface.c
  91. 35 0
      src/datawizard/malloc.c
  92. 22 32
      src/datawizard/memalloc.c
  93. 1 1
      src/datawizard/memory_manager.c
  94. 35 7
      src/datawizard/reduction.c
  95. 24 0
      src/debug/traces/starpu_fxt.c
  96. 2 1
      src/drivers/gordon/driver_gordon.c
  97. 120 0
      src/drivers/mic/driver_mic_common.c
  98. 70 0
      src/drivers/mic/driver_mic_common.h
  99. 135 0
      src/drivers/mic/driver_mic_sink.c
  100. 0 0
      src/drivers/mic/driver_mic_sink.h

+ 2 - 0
AUTHORS

@@ -10,6 +10,8 @@ David Gómez <david_gomez1380@yahoo.com.mx>
 Sylvain Henry <sylvain.henry@inria.fr>
 Sylvain Henry <sylvain.henry@inria.fr>
 Mehdi Juhoor <mjuhoor@gmail.com>
 Mehdi Juhoor <mjuhoor@gmail.com>
 Antoine Lucas <antoine.lucas.33@gmail.com>
 Antoine Lucas <antoine.lucas.33@gmail.com>
+Brice Mortier <brice.mortier@etu.u-bordeaux1.fr>
+Damien Pasqualinotto <dam.pasqualinotto@wanadoo.fr>
 Nguyen Quôc-Dinh <nguyen.quocdinh@gmail.com>
 Nguyen Quôc-Dinh <nguyen.quocdinh@gmail.com>
 Cyril Roelandt <cyril.roelandt@inria.fr>
 Cyril Roelandt <cyril.roelandt@inria.fr>
 Anthony Roy <theanthony33@gmail.com>
 Anthony Roy <theanthony33@gmail.com>

+ 4 - 0
ChangeLog

@@ -18,6 +18,8 @@ StarPU 1.2.0 (svn revision xxxx)
 ==============================================
 ==============================================
 
 
 New features:
 New features:
+  * Xeon Phi support
+  * SCC support
   * New function starpu_sched_ctx_exec_parallel_code to execute a
   * New function starpu_sched_ctx_exec_parallel_code to execute a
     parallel code on the workers of the given scheduler context
     parallel code on the workers of the given scheduler context
   * MPI:
   * MPI:
@@ -28,6 +30,8 @@ New features:
 	  before the corresponding data, which allows the receiver to
 	  before the corresponding data, which allows the receiver to
 	  allocate data correctly, and to submit the matching receive of
 	  allocate data correctly, and to submit the matching receive of
 	  the envelope.
 	  the envelope.
+  * New STARPU_COMMUTE flag which can be passed along with STARPU_W or
+    STARPU_RW to let StarPU commute write accesses.
 
 
 Small features:
 Small features:
   * Add cl_arg_free field to enable automatic free(cl_arg) on task
   * Add cl_arg_free field to enable automatic free(cl_arg) on task

+ 3 - 0
Makefile.am

@@ -67,6 +67,9 @@ versinclude_HEADERS = 				\
 	include/starpu_fxt.h			\
 	include/starpu_fxt.h			\
 	include/starpu_cuda.h			\
 	include/starpu_cuda.h			\
 	include/starpu_opencl.h			\
 	include/starpu_opencl.h			\
+	include/starpu_sink.h			\
+	include/starpu_mic.h			\
+	include/starpu_scc.h			\
 	include/starpu_expert.h			\
 	include/starpu_expert.h			\
 	include/starpu_profiling.h		\
 	include/starpu_profiling.h		\
 	include/starpu_bound.h			\
 	include/starpu_bound.h			\

+ 298 - 1
configure.ac

@@ -100,6 +100,31 @@ else
    LC_ALL=C svnversion $srcdir > ./STARPU-REVISION
    LC_ALL=C svnversion $srcdir > ./STARPU-REVISION
 fi
 fi
 
 
+AM_CONDITIONAL([STARPU_CROSS_COMPILING], [test "x$cross_compiling" = "xyes"])
+
+###############################################################################
+#                                                                             #
+#                           MIC device compilation                            #
+#   (Must be done in beginning to change prefix in the whole configuration)   #
+#                                                                             #
+###############################################################################
+
+# MIC support is opt-in.  The value given on the command line is honoured
+# through enableval, so that an explicit --disable-mic (or --enable-mic=no)
+# really keeps MIC support off instead of switching it on.
+AC_ARG_ENABLE(mic, [AS_HELP_STRING([--enable-mic],
+	      [use MIC device(s)])], [enable_mic=$enableval], [enable_mic=no])
+# RMA transfers are enabled unless --disable-mic-rma is given; when the
+# option is present, enable_mic_rma is set by autoconf itself.
+AC_ARG_ENABLE(mic-rma, [AS_HELP_STRING([--disable-mic-rma],
+	      [do not use MIC RMA transfer])], [], [enable_mic_rma=yes])
+
+if test x$enable_mic = xyes ; then
+	AC_DEFINE(STARPU_USE_MIC, [1], [MIC workers support is enabled])
+fi
+if test x$enable_mic_rma = xyes ; then
+	AC_DEFINE([STARPU_MIC_USE_RMA], [1], [MIC RMA transfer is enabled])
+fi
+
+AM_CONDITIONAL([STARPU_USE_MIC], [test "x$enable_mic" = "xyes"])
+
+###############################################################################
+
 AC_PATH_PROGS([STARPU_MS_LIB], [lib])
 AC_PATH_PROGS([STARPU_MS_LIB], [lib])
 AC_ARG_VAR([STARPU_MS_LIB], [Path to Microsoft's Visual Studio `lib' tool])
 AC_ARG_VAR([STARPU_MS_LIB], [Path to Microsoft's Visual Studio `lib' tool])
 AM_CONDITIONAL([STARPU_HAVE_MS_LIB], [test "x$STARPU_MS_LIB" != "x"])
 AM_CONDITIONAL([STARPU_HAVE_MS_LIB], [test "x$STARPU_MS_LIB" != "x"])
@@ -826,6 +851,19 @@ if test x$disable_asynchronous_opencl_copy = xyes ; then
    AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and OpenCL devices])
    AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and OpenCL devices])
 fi
 fi
 
 
+AC_MSG_CHECKING(whether asynchronous MIC copy should be disabled)
+AC_ARG_ENABLE(asynchronous-mic-copy, [AS_HELP_STRING([--disable-asynchronous-mic-copy],
+			[disable asynchronous copy between CPU and MIC devices])],
+			enable_asynchronous_mic_copy=$enableval, enable_asynchronous_mic_copy=yes)
+disable_asynchronous_mic_copy=no
+if test x$enable_asynchronous_mic_copy = xno ; then
+   disable_asynchronous_mic_copy=yes
+fi
+AC_MSG_RESULT($disable_asynchronous_mic_copy)
+if test x$disable_asynchronous_mic_copy = xyes ; then
+   AC_DEFINE([STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY], [1], [Define to 1 to disable asynchronous copy between CPU and MIC devices])
+fi
+
 ###############################################################################
 ###############################################################################
 #                                                                             #
 #                                                                             #
 #                                 Drivers                                     #
 #                                 Drivers                                     #
@@ -880,6 +918,252 @@ if test x$enable_blocking = xno -a x$enable_simgrid != xyes ; then
 	AC_DEFINE(STARPU_NON_BLOCKING_DRIVERS, [1], [drivers must progress])
 	AC_DEFINE(STARPU_NON_BLOCKING_DRIVERS, [1], [drivers must progress])
 fi
 fi
 
 
+###############################################################################
+#                                                                             #
+#                                 MIC settings                                #
+#                                                                             #
+###############################################################################
+
+AC_MSG_CHECKING(maximum number of MIC devices)
+AC_ARG_ENABLE(maxmicdev, [AS_HELP_STRING([--enable-maxmicdev=<number>],
+			[maximum number of MIC devices])],
+			nmaxmicdev=$enableval, nmaxmicdev=4)
+AC_MSG_RESULT($nmaxmicdev)
+
+AC_DEFINE_UNQUOTED(STARPU_MAXMICDEVS, [$nmaxmicdev],
+	[maximum number of MIC devices])
+
+# Maximum number of MIC cores.  The option must be registered under its own
+# feature name (maxmiccore), otherwise --enable-maxmiccore=<number> is never
+# read.  The resulting value also enters the STARPU_NMAXWORKERS computation
+# further down in this file.
+AC_MSG_CHECKING(maximum number of MIC cores)
+AC_ARG_ENABLE(maxmiccore, [AS_HELP_STRING([--enable-maxmiccore=<number>],
+			[maximum number of MIC cores])],
+			nmaxmiccore=$enableval, nmaxmiccore=128)
+AC_MSG_RESULT($nmaxmiccore)
+
+AC_DEFINE_UNQUOTED(STARPU_MAXMICCORES, [$nmaxmiccore],
+	[maximum number of MIC cores])
+
+AC_ARG_WITH(coi-dir,
+	[AS_HELP_STRING([--with-coi-dir=<path>],
+	[specify the MIC's COI installation directory])],
+	[coi_dir="$withval"],
+	[coi_dir=no])
+
+AC_ARG_WITH(coi-include-dir,
+	[AS_HELP_STRING([--with-coi-include-dir=<path>],
+	[specify where the MIC's COI headers are installed])],
+	[coi_include_dir="$withval"],
+	[coi_include_dir=no])
+
+AC_ARG_WITH(coi-lib-dir,
+	[AS_HELP_STRING([--with-coi-lib-dir=<path>],
+	[specify where the MIC's COI libraries are installed])],
+	[coi_lib_dir="$withval"],
+	[coi_lib_dir=no])
+
+# STARPU_CHECK_COI_RUNTIME(coi_dir, coi_include_dir, coi_lib_dir, coi_lib_name)
+#
+# Checks whether the MIC COI runtime is usable: the header
+# source/COIEngine_source.h must be found and the library coi_lib_name must
+# link.  Each directory argument may be "no", meaning "not specified"; the
+# include and library directories then default to coi_dir/include and
+# coi_dir/lib when coi_dir is given, and coi_dir/lib64 is tried as a second
+# chance when the library directory was not given explicitly.
+#
+# Result: have_valid_coi is set to yes or no.  On success,
+# STARPU_COI_CPPFLAGS and STARPU_COI_LDFLAGS are set, each only when the
+# corresponding directory is not "no".  CPPFLAGS and LDFLAGS are restored to
+# their initial values before the macro ends.
+AC_DEFUN([STARPU_CHECK_COI_RUNTIME],
+[
+    __coi_dir=$1
+    __coi_include_dir=$2
+    __coi_lib_dir=$3
+    __coi_lib_name=$4
+
+    if test "$__coi_dir" != "no" -a "$__coi_dir" != "" ; then
+	AC_MSG_CHECKING(whether MIC's COI runtime is available in $__coi_dir)
+    else
+	AC_MSG_CHECKING(whether MIC's COI runtime is available)
+    fi
+    AC_MSG_RESULT()
+
+    # Derive the include and library directories from the installation
+    # directory when they were not given explicitly.
+    if test "$__coi_include_dir" = "no" -a "$__coi_dir" != "no" ; then
+        __coi_include_dir="${__coi_dir}/include"
+    fi
+    if test "$__coi_lib_dir" = "no" -a "$__coi_dir" != "no" ; then
+        __coi_lib_dir="${__coi_dir}/lib"
+    fi
+
+    # The flags are only modified for the duration of the checks below.
+    SAVED_CPPFLAGS="$CPPFLAGS"
+    SAVED_LDFLAGS="$LDFLAGS"
+
+    if test "$__coi_include_dir" != "no" ; then
+        CPPFLAGS="${CPPFLAGS} -I$__coi_include_dir"
+    fi
+    if test "$__coi_lib_dir" != "no" ; then
+	LDFLAGS="${LDFLAGS} -L$__coi_lib_dir"
+    fi
+
+    AC_CHECK_HEADER([source/COIEngine_source.h],[have_valid_coi=yes],[have_valid_coi=no])
+
+    if test "$have_valid_coi" = "yes" ; then
+	AC_HAVE_LIBRARY([$__coi_lib_name],[have_valid_coi=yes],[have_valid_coi=no])
+
+        if test "$have_valid_coi" = "no" ; then
+            if test "$3" = "no" -a "$__coi_dir" != "no" ; then
+		# ${__coi_dir}/lib didn't work, let's try with lib64
+                __coi_lib_dir="$__coi_dir/lib64"
+		LDFLAGS="${SAVED_LDFLAGS} -L$__coi_lib_dir"
+	        AC_HAVE_LIBRARY([$__coi_lib_name],[have_valid_coi=yes],[have_valid_coi=no])
+            fi
+        fi
+    fi
+
+    # Export the flags that made the checks succeed.
+    # NOTE(review): when no library directory is known, STARPU_COI_LDFLAGS is
+    # left untouched, so the library name is not recorded either -- confirm
+    # this is intended.
+    if test "$have_valid_coi" = "yes" -a "$__coi_include_dir" != "no"; then
+        STARPU_COI_CPPFLAGS="-I$__coi_include_dir"
+    fi
+
+    if test "$have_valid_coi" = "yes" -a "$__coi_lib_dir" != "no"; then
+        STARPU_COI_LDFLAGS="-L$__coi_lib_dir -l$__coi_lib_name"
+    fi
+
+    CPPFLAGS="${SAVED_CPPFLAGS}"
+    LDFLAGS="${SAVED_LDFLAGS}"
+])
+
+if test x$enable_mic = xyes ; then
+
+    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, "coi_host")
+
+    # Host runtime is not compatible, we are probably cross-compiling
+    # Let's have a look for the device runtime which lib has a different name
+    if test "$have_valid_coi" = "no" ; then
+	    STARPU_CHECK_COI_RUNTIME($coi_dir, $coi_include_dir, $coi_lib_dir, "coi_device")
+    fi
+
+    if test "$have_valid_coi" = "no" ; then
+	AC_MSG_ERROR([cannot find MIC's COI runtime])
+    fi
+
+    AC_SUBST(STARPU_COI_CPPFLAGS)
+    AC_SUBST(STARPU_COI_LDFLAGS)
+fi
+
+###############################################################################
+#                                                                             #
+#                                 SCC settings                                #
+#                                                                             #
+###############################################################################
+
+AC_ARG_ENABLE([rcce], [AS_HELP_STRING([--disable-rcce],
+			  [do not use SCC device(s)])], [], enable_rcce=maybe)
+
+nmaxsccdev=47
+AC_DEFINE_UNQUOTED(STARPU_MAXSCCDEVS, [$nmaxsccdev], [maximum number of SCC devices])
+
+AC_ARG_WITH(rcce-dir,
+			[AS_HELP_STRING([--with-rcce-dir=<path>],
+							[specify RCCE installation directory])],
+			[
+				rcce_dir="$withval"
+				enable_rcce=yes
+			],
+			rcce_dir=no)
+
+AC_ARG_WITH(rcce-include-dir,
+			[AS_HELP_STRING([--with-rcce-include-dir=<path>],
+							[specify where RCCE headers are installed])],
+			[
+				rcce_include_dir="$withval"
+				enable_rcce=yes
+			],
+			rcce_include_dir=no)
+
+AC_ARG_WITH(rcce-lib-dir,
+			[AS_HELP_STRING([--with-rcce-lib-dir=<path>],
+							[specify where RCCE libraries are installed])],
+			[
+			 	rcce_lib_dir="$withval"
+			 	enable_rcce=yes
+			],
+			rcce_lib_dir=no)
+
+if test x$enable_rcce = xyes -o x$enable_rcce = xmaybe ; then
+	have_valid_rcce=yes
+
+	SAVED_LDFLAGS="${LDFLAGS}"
+	SAVED_CPPFLAGS="${CPPFLAGS}"
+	SAVED_LIBS="${LIBS}"
+
+	if test x$rcce_include_dir != xno ; then
+		STARPU_RCCE_CPPFLAGS="-I${rcce_include_dir}"
+	elif test x$rcce_dir != xno ; then
+		STARPU_RCCE_CPPFLAGS="-I${rcce_dir}/include"
+	fi
+
+	CPPFLAGS="${CPPFLAGS} ${STARPU_RCCE_CPPFLAGS}"
+	AC_CHECK_HEADER([RCCE.h], [], [have_valid_rcce=no])
+
+	# Library search path: an explicit --with-rcce-lib-dir wins, otherwise
+	# fall back to the lib/ subdirectory of --with-rcce-dir.
+	if test x$rcce_lib_dir != xno ; then
+		STARPU_RCCE_LDFLAGS="-L${rcce_lib_dir}"
+	elif test x$rcce_dir != xno ; then
+		STARPU_RCCE_LDFLAGS="-L${rcce_dir}/lib"
+	fi
+
+	LDFLAGS="${LDFLAGS} ${STARPU_RCCE_LDFLAGS}"
+	AC_CHECK_LIB([RCCE_bigflags_nongory_nopwrmgmt], [RCCE_init], [], [have_valid_rcce=no])
+
+	# in case RCCE was explicitly required, but is not available, this is an error
+	if test x$enable_rcce = xyes -a x$have_valid_rcce = xno ; then
+		AC_MSG_ERROR([cannot find RCCE library])
+	fi
+
+	if test x$have_valid_rcce = xyes ; then
+		STARPU_RCCE_CPPFLAGS="${STARPU_RCCE_CPPFLAGS} -DSCC"
+		STARPU_RCCE_LDFLAGS="${STARPU_RCCE_LDFLAGS} -lRCCE_bigflags_nongory_nopwrmgmt -ldl"
+
+		AC_DEFINE(STARPU_USE_SCC, [1], [SCC support is enabled])
+
+		AC_SUBST(STARPU_RCCE_CFLAGS)
+		AC_SUBST(STARPU_RCCE_CPPFLAGS)
+		AC_SUBST(STARPU_RCCE_LDFLAGS)
+	fi
+
+	LDFLAGS="${SAVED_LDFLAGS}"
+	CPPFLAGS="${SAVED_CPPFLAGS}"
+	LIBS="${SAVED_LIBS}"
+
+	enable_rcce=$have_valid_rcce
+fi
+
+AM_CONDITIONAL(STARPU_USE_SCC, test x$enable_rcce = xyes)
+
+AC_MSG_CHECKING(whether RCCE should be used)
+AC_MSG_RESULT($enable_rcce)
+
+
+###############################################################################
+#                                                                             #
+#                             MP Common settings                              #
+#                                                                             #
+###############################################################################
+
+AM_CONDITIONAL([STARPU_USE_MP], [test "x$enable_mic" = "xyes" -o "x$enable_mpi" = "xyes" -o "x$enable_rcce" = "xyes"])
+
+AC_ARG_ENABLE([export-dynamic], [AS_HELP_STRING([--disable-export-dynamic],
+			  [Prevent the linker from adding all symbols to the dynamic symbol table])], [], [])
+
+if test x$enable_mic = xyes -o x$enable_mpi = xyes -o x$enable_rcce = xyes ; then
+	AC_DEFINE(STARPU_USE_MP, [1], [Message-passing SINKs support
+		  is enabled])
+
+	if test x$enable_export_dynamic != xno ; then
+		STARPU_EXPORT_DYNAMIC="-rdynamic"
+	fi
+fi
+
+AC_SUBST(STARPU_EXPORT_DYNAMIC)
+
+# Maximum number of different kernels a message-passing sink can look up
+# and launch.  The help string is kept on one line so that the option is
+# displayed as --enable-maxmpkernels=<number> in the configure help output.
+AC_MSG_CHECKING(Maximum number of message-passing kernels)
+AC_ARG_ENABLE(maxmpkernels, [AS_HELP_STRING([--enable-maxmpkernels=<number>],
+	      [maximum number of kernels a message-passing sink can look up
+	      and execute])],
+	      maxmpkernels=$enableval, maxmpkernels=10)
+AC_MSG_RESULT($maxmpkernels)
+AC_DEFINE_UNQUOTED(STARPU_MAXMPKERNELS, [$maxmpkernels],
+		[maximum number of message-passing kernels])
+
+###############################################################################
 
 
 ###############################################################################
 ###############################################################################
 #                                                                             #
 #                                                                             #
@@ -1075,6 +1359,7 @@ if test x$enable_simgrid = xyes ; then
 	maxnodes=16
 	maxnodes=16
 else
 else
 	# We have one memory node shared by all CPU workers, one node per GPU
 	# We have one memory node shared by all CPU workers, one node per GPU
+	# and per MIC device
 	nodes=1
 	nodes=1
 	if test x$enable_cuda = xyes ; then
 	if test x$enable_cuda = xyes ; then
 		# we could have used nmaxcudadev + 1, but this would certainly give an
 		# we could have used nmaxcudadev + 1, but this would certainly give an
@@ -1086,6 +1371,14 @@ else
 		# odd number.
 		# odd number.
 		nodes=`expr $nodes + $nmaxopencldev`
 		nodes=`expr $nodes + $nmaxopencldev`
 	fi
 	fi
+	if test x$enable_mic = xyes ; then
+		nodes=`expr $nodes + $nmaxmicdev`
+	fi
+	if test x$enable_rcce = xyes ; then
+		# Only 1 memory node for the shared memory.
+		nodes=`expr $nodes + 1`
+	fi
+
 	# set maxnodes to the next power of 2 greater than nodes
 	# set maxnodes to the next power of 2 greater than nodes
 	maxnodes=1
 	maxnodes=1
 	while test "$maxnodes" -lt "$nodes"
 	while test "$maxnodes" -lt "$nodes"
@@ -1137,7 +1430,7 @@ AC_CHECK_FUNCS([clock_gettime])
 
 
 # Compute the maximum number of workers (we round it to 16 for alignment
 # Compute the maximum number of workers (we round it to 16 for alignment
 # purposes).
 # purposes).
-nmaxworkers=`expr 16 \* \( \( $maxcpus + $nmaxcudadev + $nmaxopencldev + 15 \) / 16 \) `
+nmaxworkers=`expr 16 \* \( \( $maxcpus + $nmaxcudadev + $nmaxopencldev + $nmaxmiccore + $nmaxsccdev + 15 \) / 16 \) `
 AC_MSG_CHECKING(Maximum number of workers)
 AC_MSG_CHECKING(Maximum number of workers)
 AC_MSG_RESULT($nmaxworkers)
 AC_MSG_RESULT($nmaxworkers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])
 AC_DEFINE_UNQUOTED(STARPU_NMAXWORKERS, [$nmaxworkers], [Maximum number of workers])
@@ -1868,6 +2161,7 @@ AC_SUBST([LIBSTARPU_LINK])
 # File configuration
 # File configuration
 AC_CONFIG_COMMANDS([executable-scripts], [
 AC_CONFIG_COMMANDS([executable-scripts], [
   chmod +x tests/regression/regression.sh
   chmod +x tests/regression/regression.sh
+  chmod +x tests/loader-cross.sh
   chmod +x gcc-plugin/tests/run-test
   chmod +x gcc-plugin/tests/run-test
   chmod +x tools/starpu_codelet_profile
   chmod +x tools/starpu_codelet_profile
   chmod +x tools/starpu_codelet_histo_profile
   chmod +x tools/starpu_codelet_histo_profile
@@ -1914,6 +2208,7 @@ AC_OUTPUT([
 	examples/Makefile
 	examples/Makefile
 	examples/stencil/Makefile
 	examples/stencil/Makefile
 	tests/Makefile
 	tests/Makefile
+	tests/loader-cross.sh
 	doc/Makefile
 	doc/Makefile
 	mpi/Makefile
 	mpi/Makefile
 	mpi/src/Makefile
 	mpi/src/Makefile
@@ -1937,6 +2232,8 @@ AC_MSG_NOTICE([
 	CPUs   enabled: $enable_cpu
 	CPUs   enabled: $enable_cpu
 	CUDA   enabled: $enable_cuda
 	CUDA   enabled: $enable_cuda
 	OpenCL enabled: $enable_opencl
 	OpenCL enabled: $enable_opencl
+	SCC    enabled: $enable_rcce
+	MIC    enabled: $enable_mic
 
 
 	Compile-time limits
 	Compile-time limits
 	(change these with --enable-maxcpus, --enable-maxcudadev,
 	(change these with --enable-maxcpus, --enable-maxcudadev,

+ 11 - 0
doc/chapters/advanced-examples.texi

@@ -61,6 +61,7 @@ void scal_sse_func(void *buffers[], void *cl_arg)
 struct starpu_codelet cl = @{
 struct starpu_codelet cl = @{
     .where = STARPU_CPU,
     .where = STARPU_CPU,
     .cpu_funcs = @{ scal_cpu_func, scal_sse_func, NULL @},
     .cpu_funcs = @{ scal_cpu_func, scal_sse_func, NULL @},
+    .cpu_funcs_name = @{ "scal_cpu_func", "scal_sse_func", NULL @},
     .nbuffers = 1,
     .nbuffers = 1,
     .modes = @{ STARPU_RW @}
     .modes = @{ STARPU_RW @}
 @};
 @};
@@ -100,6 +101,7 @@ struct starpu_codelet cl = @{
     .where = STARPU_CPU|STARPU_CUDA,
     .where = STARPU_CPU|STARPU_CUDA,
     .can_execute = can_execute,
     .can_execute = can_execute,
     .cpu_funcs = @{ cpu_func, NULL @},
     .cpu_funcs = @{ cpu_func, NULL @},
+    .cpu_funcs_name = @{ "cpu_func", NULL @},
     .cuda_funcs = @{ gpu_func, NULL @}
     .cuda_funcs = @{ gpu_func, NULL @}
     .nbuffers = 1,
     .nbuffers = 1,
     .modes = @{ STARPU_RW @}
     .modes = @{ STARPU_RW @}
@@ -146,6 +148,7 @@ struct starpu_codelet cl = @{
     .where = STARPU_CPU|STARPU_CUDA,
     .where = STARPU_CPU|STARPU_CUDA,
     .can_execute = can_execute,
     .can_execute = can_execute,
     .cpu_funcs = @{ cpu_func, NULL @},
     .cpu_funcs = @{ cpu_func, NULL @},
+    .cpu_funcs_name = @{ "cpu_func", NULL @},
     .cuda_funcs = @{ scal_gpu_13, scal_gpu_20, NULL @},
     .cuda_funcs = @{ scal_gpu_13, scal_gpu_20, NULL @},
     .nbuffers = 1,
     .nbuffers = 1,
     .modes = @{ STARPU_RW @}
     .modes = @{ STARPU_RW @}
@@ -359,6 +362,7 @@ static struct starpu_perfmodel mult_perf_model = @{
 struct starpu_codelet cl = @{
 struct starpu_codelet cl = @{
     .where = STARPU_CPU,
     .where = STARPU_CPU,
     .cpu_funcs = @{ cpu_mult, NULL @},
     .cpu_funcs = @{ cpu_mult, NULL @},
+    .cpu_funcs_name = @{ "cpu_mult", NULL @},
     .nbuffers = 3,
     .nbuffers = 3,
     .modes = @{ STARPU_R, STARPU_R, STARPU_W @},
     .modes = @{ STARPU_R, STARPU_R, STARPU_W @},
     /* for the scheduling policy to be able to use performance models */
     /* for the scheduling policy to be able to use performance models */
@@ -520,6 +524,7 @@ void func_cpu(void *descr[], void *_args)
 struct starpu_codelet mycodelet = @{
 struct starpu_codelet mycodelet = @{
         .where = STARPU_CPU,
         .where = STARPU_CPU,
         .cpu_funcs = @{ func_cpu, NULL @},
         .cpu_funcs = @{ func_cpu, NULL @},
+        .cpu_funcs_name = @{ "func_cpu", NULL @},
         .nbuffers = 2,
         .nbuffers = 2,
         .modes = @{ STARPU_RW, STARPU_RW @}
         .modes = @{ STARPU_RW, STARPU_RW @}
 @};
 @};
@@ -623,6 +628,7 @@ the codelets for initialization and reduction:
 struct starpu_codelet bzero_variable_cl =
 struct starpu_codelet bzero_variable_cl =
 @{
 @{
         .cpu_funcs = @{ bzero_variable_cpu, NULL @},
         .cpu_funcs = @{ bzero_variable_cpu, NULL @},
+        .cpu_funcs_name = @{ "bzero_variable_cpu", NULL @},
         .cuda_funcs = @{ bzero_variable_cuda, NULL @},
         .cuda_funcs = @{ bzero_variable_cuda, NULL @},
         .nbuffers = 1,
         .nbuffers = 1,
 @}
 @}
@@ -645,6 +651,7 @@ static void accumulate_variable_cuda(void *descr[], void *cl_arg)
 struct starpu_codelet accumulate_variable_cl =
 struct starpu_codelet accumulate_variable_cl =
 @{
 @{
         .cpu_funcs = @{ accumulate_variable_cpu, NULL @},
         .cpu_funcs = @{ accumulate_variable_cpu, NULL @},
+        .cpu_funcs_name = @{ "accumulate_variable_cpu", NULL @},
         .cuda_funcs = @{ accumulate_variable_cuda, NULL @},
         .cuda_funcs = @{ accumulate_variable_cuda, NULL @},
         .nbuffers = 1,
         .nbuffers = 1,
 @}
 @}
@@ -827,6 +834,7 @@ static struct starpu_codelet cl =
     .type = STARPU_FORKJOIN,
     .type = STARPU_FORKJOIN,
     .max_parallelism = INT_MAX,
     .max_parallelism = INT_MAX,
     .cpu_funcs = @{scal_cpu_func, NULL@},
     .cpu_funcs = @{scal_cpu_func, NULL@},
+    .cpu_funcs_name = @{"scal_cpu_func", NULL@},
     .nbuffers = 1,
     .nbuffers = 1,
 @};
 @};
 @end smallexample
 @end smallexample
@@ -870,6 +878,7 @@ static struct starpu_codelet cl =
     .type = STARPU_SPMD,
     .type = STARPU_SPMD,
     .max_parallelism = INT_MAX,
     .max_parallelism = INT_MAX,
     .cpu_funcs = @{ func, NULL @},
     .cpu_funcs = @{ func, NULL @},
+    .cpu_funcs_name = @{ "func", NULL @},
     .nbuffers = 1,
     .nbuffers = 1,
 @}
 @}
 @end smallexample
 @end smallexample
@@ -977,6 +986,7 @@ void opencl_to_cpu_func(void *buffers[], void *args);
 struct starpu_codelet opencl_to_cpu_cl = @{
 struct starpu_codelet opencl_to_cpu_cl = @{
     .where = STARPU_CPU,
     .where = STARPU_CPU,
     .cpu_funcs = @{ opencl_to_cpu_func, NULL @},
     .cpu_funcs = @{ opencl_to_cpu_func, NULL @},
+    .cpu_funcs_name = @{ "opencl_to_cpu_func", NULL @},
     .nbuffers = 1,
     .nbuffers = 1,
     .modes = @{ STARPU_RW @}
     .modes = @{ STARPU_RW @}
 @};
 @};
@@ -1287,6 +1297,7 @@ struct starpu_codelet dummy_big_cl =
 	.cuda_funcs = @{dummy_big_kernel, NULL@},
 	.cuda_funcs = @{dummy_big_kernel, NULL@},
 	.opencl_funcs = @{dummy_big_kernel, NULL@},
 	.opencl_funcs = @{dummy_big_kernel, NULL@},
 	.cpu_funcs = @{dummy_big_kernel, NULL@},
 	.cpu_funcs = @{dummy_big_kernel, NULL@},
+	.cpu_funcs_name = @{"dummy_big_kernel", NULL@},
 	.nbuffers = STARPU_NMAXBUFS+1,
 	.nbuffers = STARPU_NMAXBUFS+1,
 	.dyn_modes = modes
 	.dyn_modes = modes
 @};
 @};

+ 217 - 8
doc/chapters/api.texi

@@ -24,6 +24,8 @@
 * Theoretical lower bound on execution time API::
 * Theoretical lower bound on execution time API::
 * CUDA extensions::
 * CUDA extensions::
 * OpenCL extensions::
 * OpenCL extensions::
+* MIC extensions::
+* SCC extensions::
 * Miscellaneous helpers::
 * Miscellaneous helpers::
 * FXT Support::
 * FXT Support::
 * FFT Support::
 * FFT Support::
@@ -104,6 +106,14 @@ be specified with the @code{STARPU_NCUDA} environment variable.
 This is the number of OpenCL devices that StarPU can use. This can
 This is the number of OpenCL devices that StarPU can use. This can
 also be specified with the @code{STARPU_NOPENCL} environment variable.
 also be specified with the @code{STARPU_NOPENCL} environment variable.
 
 
+@item @code{int nmic} (default = -1)
+This is the number of MIC devices that StarPU can use. This can
+also be specified with the @code{STARPU_NMIC} environment variable.
+
+@item @code{int nscc} (default = -1)
+This is the number of SCC devices that StarPU can use. This can
+also be specified with the @code{STARPU_NSCC} environment variable.
+
 @item @code{unsigned use_explicit_workers_bindid} (default = 0)
 @item @code{unsigned use_explicit_workers_bindid} (default = 0)
 If this flag is set, the @code{workers_bindid} array indicates where the
 If this flag is set, the @code{workers_bindid} array indicates where the
 different workers are bound, otherwise StarPU automatically selects where to
 different workers are bound, otherwise StarPU automatically selects where to
@@ -139,6 +149,26 @@ the @code{STARPU_WORKERS_OPENCLID} environment variable.
 If the @code{use_explicit_workers_opencl_gpuid} flag is set, this array
 If the @code{use_explicit_workers_opencl_gpuid} flag is set, this array
 contains the logical identifiers of the OpenCL devices to be used.
 contains the logical identifiers of the OpenCL devices to be used.
 
 
+@item @code{unsigned use_explicit_workers_mic_gpuid} (default = 0)
+If this flag is set, the MIC workers will be attached to the MIC devices
+specified in the @code{workers_mic_gpuid} array. Otherwise, StarPU assigns
+the MIC devices in a round-robin fashion. This can also be specified with
+the @code{STARPU_WORKERS_MICID} environment variable.
+
+@item @code{unsigned workers_mic_gpuid[STARPU_NMAXWORKERS]}
+If the @code{use_explicit_workers_mic_gpuid} flag is set, this array
+contains the logical identifiers of the MIC devices to be used.
+
+@item @code{unsigned use_explicit_workers_scc_gpuid} (default = 0)
+If this flag is set, the SCC workers will be attached to the SCC devices
+specified in the @code{workers_scc_gpuid} array. Otherwise, StarPU assigns
+the SCC devices in a round-robin fashion. This can also be specified with
+the @code{STARPU_WORKERS_SCCID} environment variable.
+
+@item @code{unsigned workers_scc_gpuid[STARPU_NMAXWORKERS]}
+If the @code{use_explicit_workers_scc_gpuid} flag is set, this array
+contains the logical identifiers of the SCC devices to be used.
+
 @item @code{int calibrate} (default = 0)
 @item @code{int calibrate} (default = 0)
 If this flag is set, StarPU will calibrate the performance models when
 If this flag is set, StarPU will calibrate the performance models when
 executing tasks. If this value is equal to @code{-1}, the default value is
 executing tasks. If this value is equal to @code{-1}, the default value is
@@ -162,6 +192,11 @@ task scheduler will however still however still try varying combined worker
 sizes to look for the most efficient ones.
 sizes to look for the most efficient ones.
 This can also be specified with the @code{STARPU_SINGLE_COMBINED_WORKER} environment variable.
 This can also be specified with the @code{STARPU_SINGLE_COMBINED_WORKER} environment variable.
 
 
+@item @code{mic_sink_program_path} (default = NULL)
+Path to the program to execute on the MIC device, compiled for MIC
+architecture. When set to NULL, StarPU automatically looks next to the host
+program location.
+
 @item @code{int disable_asynchronous_copy} (default = 0)
 @item @code{int disable_asynchronous_copy} (default = 0)
 This flag should be set to 1 to disable asynchronous copies between
 This flag should be set to 1 to disable asynchronous copies between
 CPUs and all accelerators. This can also be specified with the
 CPUs and all accelerators. This can also be specified with the
@@ -189,6 +224,13 @@ it is therefore necessary to disable asynchronous data transfers.
 This can also be specified at compilation time by giving to the
 This can also be specified at compilation time by giving to the
 configure script the option @code{--disable-asynchronous-opencl-copy}.
 configure script the option @code{--disable-asynchronous-opencl-copy}.
 
 
+@item @code{int disable_asynchronous_mic_copy} (default = 0)
+This flag should be set to 1 to disable asynchronous copies between
+CPUs and MIC accelerators. This can also be specified with the
+@code{STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY} environment variable.
+This can also be specified at compilation time by giving to the
+configure script the option @code{--disable-asynchronous-mic-copy}.
+
 @item @code{int *cuda_opengl_interoperability} (default = NULL)
 @item @code{int *cuda_opengl_interoperability} (default = NULL)
 This can be set to an array of CUDA device identifiers for which
 This can be set to an array of CUDA device identifiers for which
 @code{cudaGLSetGLDevice} should be called instead of @code{cudaSetDevice}. Its
 @code{cudaGLSetGLDevice} should be called instead of @code{cudaSetDevice}. Its
@@ -223,6 +265,12 @@ Upon successful completion, this function returns 0. Otherwise, @code{-ENODEV}
 indicates that no worker was available (so that StarPU was not initialized).
 indicates that no worker was available (so that StarPU was not initialized).
 @end deftypefun
 @end deftypefun
 
 
+@deftypefun int starpu_initialize ({struct starpu_conf *}@var{conf}, int @var{argc}, {char ***}@var{argv})
+This is the same as @code{starpu_init}, but also takes the @code{argc} and
+@code{argv} as received by the application. This is needed for SCC
+execution to initialize the communication library.
+@end deftypefun
+
 @deftypefun int starpu_conf_init ({struct starpu_conf *}@var{conf})
 @deftypefun int starpu_conf_init ({struct starpu_conf *}@var{conf})
 This function initializes the @var{conf} structure passed as argument
 This function initializes the @var{conf} structure passed as argument
 with the default values. In case some configuration parameters are already
 with the default values. In case some configuration parameters are already
@@ -319,6 +367,8 @@ The different values are:
 @item @code{STARPU_CPU_WORKER}
 @item @code{STARPU_CPU_WORKER}
 @item @code{STARPU_CUDA_WORKER}
 @item @code{STARPU_CUDA_WORKER}
 @item @code{STARPU_OPENCL_WORKER}
 @item @code{STARPU_OPENCL_WORKER}
+@item @code{STARPU_MIC_WORKER}
+@item @code{STARPU_SCC_WORKER}
 @end table
 @end table
 @end deftp
 @end deftp
 
 
@@ -348,6 +398,20 @@ This function returns the number of OpenCL devices controlled by StarPU. The ret
 value should be at most @code{STARPU_MAXOPENCLDEVS}.
 value should be at most @code{STARPU_MAXOPENCLDEVS}.
 @end deftypefun
 @end deftypefun
 
 
+@deftypefun unsigned starpu_mic_worker_get_count (void)
+This function returns the number of MIC workers controlled by StarPU.
+@end deftypefun
+
+@deftypefun unsigned starpu_mic_device_get_count (void)
+This function returns the number of MIC devices controlled by StarPU. The returned
+value should be at most @code{STARPU_MAXMICDEVS}.
+@end deftypefun
+
+@deftypefun unsigned starpu_scc_worker_get_count (void)
+This function returns the number of SCC devices controlled by StarPU. The returned
+value should be at most @code{STARPU_MAXSCCDEVS}.
+@end deftypefun
+
 @deftypefun int starpu_worker_get_id (void)
 @deftypefun int starpu_worker_get_id (void)
 This function returns the identifier of the current worker, i.e the one associated to the calling
 This function returns the identifier of the current worker, i.e the one associated to the calling
 thread. The returned value is either -1 if the current context is not a StarPU
 thread. The returned value is either -1 if the current context is not a StarPU
@@ -420,6 +484,9 @@ todo
 @item @code{STARPU_CPU_RAM}
 @item @code{STARPU_CPU_RAM}
 @item @code{STARPU_CUDA_RAM}
 @item @code{STARPU_CUDA_RAM}
 @item @code{STARPU_OPENCL_RAM}
 @item @code{STARPU_OPENCL_RAM}
+@item @code{STARPU_MIC_RAM}
+@item @code{STARPU_SCC_RAM}
+@item @code{STARPU_SCC_SHM}
 @end table
 @end table
 @end deftp
 @end deftp
 
 
@@ -501,6 +568,12 @@ ignored for now.
 @end table
 @end table
 @end deftp
 @end deftp
 
 
+In addition to that, @code{STARPU_COMMUTE} can be passed along with @code{STARPU_W}
+or @code{STARPU_RW} to express that StarPU can let tasks commute, which is
+useful e.g. when bringing a contribution into some data, which can be done
+in any order (but still requires sequential consistency against reads or
+non-commutative writes).
+
 @deftp {Data Type} {starpu_data_handle_t}
 @deftp {Data Type} {starpu_data_handle_t}
 StarPU uses @code{starpu_data_handle_t} as an opaque handle to manage a piece of
 StarPU uses @code{starpu_data_handle_t} as an opaque handle to manage a piece of
 data. Once a piece of data has been registered to StarPU, it is associated to a
 data. Once a piece of data has been registered to StarPU, it is associated to a
@@ -1364,8 +1437,8 @@ be useful to provide more specific method in case of e.g. available particular
 CUDA or OpenCL support.
 CUDA or OpenCL support.
 
 
 @table @asis
 @table @asis
-@item @code{int (*@{ram,cuda,opencl@}_to_@{ram,cuda,opencl@})(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
-These 12 functions define how to copy data from the @var{src_interface}
+@item @code{int (*@{ram,cuda,opencl,mic@}_to_@{ram,cuda,opencl,mic@})(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
+These 14 functions define how to copy data from the @var{src_interface}
 interface on the @var{src_node} node to the @var{dst_interface} interface
 interface on the @var{src_node} node to the @var{dst_interface} interface
 on the @var{dst_node} node. They return 0 on success.
 on the @var{dst_node} node. They return 0 on success.
 
 
@@ -1386,6 +1459,22 @@ Must return 0 if the transfer was actually completed completely synchronously,
 or -EAGAIN if at least some transfers are still ongoing and should be awaited
 or -EAGAIN if at least some transfers are still ongoing and should be awaited
 for by the core.
 for by the core.
 
 
+@item @code{int (*@{ram,mic@}_to_@{ram,mic@}_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
+These 2 functions (@code{ram_to_ram} and @code{mic_to_mic} are not among them) define how to copy
+data from the @var{src_interface} interface on the @var{src_node} node to the
+@var{dst_interface} interface on the @var{dst_node} node.
+Must return 0 if the transfer was actually completed completely synchronously,
+or -EAGAIN if at least some transfers are still ongoing and should be awaited
+for by the core.
+
+@item @code{int (*@{src,sink@}_to_@{src,sink@}_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)}
+These 3 functions (@code{src_to_src} is not among them) define how to copy
+data from the @var{src_interface} interface on the @var{src_node} node to the
+@var{dst_interface} interface on the @var{dst_node} node.
+Must return 0 if the transfer was actually completed completely synchronously,
+or -EAGAIN if at least some transfers are still ongoing and should be awaited
+for by the core.
+
 @item @code{int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)}
 @item @code{int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data)}
 Define how to copy data from the @var{src_interface} interface on the
 Define how to copy data from the @var{src_interface} interface on the
 @var{src_node} node to the @var{dst_interface} interface on the @var{dst_node}
 @var{src_node} node to the @var{dst_interface} interface on the @var{dst_node}
@@ -1729,6 +1818,24 @@ pointer to a codelet which converts from CPU to CUDA
 
 
 @item @code{struct starpu_codelet *cuda_to_cpu_cl}
 @item @code{struct starpu_codelet *cuda_to_cpu_cl}
 pointer to a codelet which converts from CUDA to CPU
 pointer to a codelet which converts from CUDA to CPU
+
+@item @code{size_t mic_elemsize}
+the size of each element on MIC devices,
+
+@item @code{struct starpu_codelet *cpu_to_mic_cl}
+pointer to a codelet which converts from CPU to MIC
+
+@item @code{struct starpu_codelet *mic_to_cpu_cl}
+pointer to a codelet which converts from MIC to CPU
+
+@item @code{size_t scc_elemsize}
+the size of each element on SCC devices,
+
+@item @code{struct starpu_codelet *cpu_to_scc_cl}
+pointer to a codelet which converts from CPU to SCC
+
+@item @code{struct starpu_codelet *scc_to_cpu_cl}
+pointer to a codelet which converts from SCC to CPU
 @end table
 @end table
 @end deftp
 @end deftp
 
 
@@ -1791,7 +1898,19 @@ processing unit.
 
 
 @defmac STARPU_OPENCL
 @defmac STARPU_OPENCL
 This macro is used when setting the field @code{where} of a @code{struct
 This macro is used when setting the field @code{where} of a @code{struct
-starpu_codelet} to specify the codelet may be executed on a OpenCL
+starpu_codelet} to specify the codelet may be executed on an OpenCL
+processing unit.
+@end defmac
+
+@defmac STARPU_MIC
+This macro is used when setting the field @code{where} of a @code{struct
+starpu_codelet} to specify the codelet may be executed on a MIC
+processing unit.
+@end defmac
+
+@defmac STARPU_SCC
+This macro is used when setting the field @code{where} of a @code{struct
+starpu_codelet} to specify the codelet may be executed on an SCC
 processing unit.
 processing unit.
 @end defmac
 @end defmac
 
 
@@ -1864,6 +1983,12 @@ If the @code{where} field is set, then the @code{cpu_funcs} field is
 ignored if @code{STARPU_CPU} does not appear in the @code{where}
 ignored if @code{STARPU_CPU} does not appear in the @code{where}
 field, it must be non-null otherwise.
 field, it must be non-null otherwise.
 
 
+@item @code{char * cpu_funcs_name[STARPU_MAXIMPLEMENTATIONS]} (optional)
+Is an array of strings which provide the name of the CPU functions referenced in
+the @code{cpu_funcs} array. This can be used when running on MIC devices or the
+SCC platform, for StarPU to simply look up the MIC or SCC function implementation
+through its name.
+
 @item @code{starpu_cuda_func_t cuda_func} (optional)
 @item @code{starpu_cuda_func_t cuda_func} (optional)
 This field has been made deprecated. One should use instead the
 This field has been made deprecated. One should use instead the
 @code{cuda_funcs} field.
 @code{cuda_funcs} field.
@@ -1891,6 +2016,28 @@ If the @code{where} field is set, then the @code{opencl_funcs} field
 is ignored if @code{STARPU_OPENCL} does not appear in the @code{where}
 is ignored if @code{STARPU_OPENCL} does not appear in the @code{where}
 field, it must be non-null otherwise.
 field, it must be non-null otherwise.
 
 
+@item @code{starpu_mic_func_t mic_funcs[STARPU_MAXIMPLEMENTATIONS]} (optional)
+Is an array of function pointers to a function which returns the MIC
+implementation of the codelet.
+It must be terminated by a NULL value.
+The function prototype must be:
+@code{starpu_mic_kernel_t mic_func(struct starpu_codelet *cl, unsigned nimpl);}.
+If the @code{where} field is set, then the @code{mic_funcs} field
+is ignored if @code{STARPU_MIC} does not appear in the @code{where}
+field. It can be null if @code{cpu_funcs_name} is non-NULL, in which case StarPU
+will simply make a symbol lookup to get the implementation.
+
+@item @code{starpu_scc_func_t scc_funcs[STARPU_MAXIMPLEMENTATIONS]} (optional)
+Is an array of function pointers to a function which returns the SCC
+implementation of the codelet.
+It must be terminated by a NULL value.
+The function prototype must be:
+@code{starpu_scc_kernel_t scc_func(struct starpu_codelet *cl, unsigned nimpl);}.
+If the @code{where} field is set, then the @code{scc_funcs} field
+is ignored if @code{STARPU_SCC} does not appear in the @code{where}
+field. It can be null if @code{cpu_funcs_name} is non-NULL, in which case StarPU
+will simply make a symbol lookup to get the implementation.
+
 @item @code{unsigned nbuffers}
 @item @code{unsigned nbuffers}
 Specifies the number of arguments taken by the codelet. These arguments are
 Specifies the number of arguments taken by the codelet. These arguments are
 managed by the DSM and are accessed from the @code{void *buffers[]}
 managed by the DSM and are accessed from the @code{void *buffers[]}
@@ -2295,6 +2442,7 @@ executing. It thus does not include tasks waiting for dependencies.
 This function returns the task currently executed by the worker, or
 This function returns the task currently executed by the worker, or
 NULL if it is called either from a thread that is not a task or simply
 NULL if it is called either from a thread that is not a task or simply
 because there is no task being executed at the moment.
 because there is no task being executed at the moment.
+This function must be called from the callback (not from the codelet).
 @end deftypefun
 @end deftypefun
 
 
 @deftypefun void starpu_codelet_display_stats ({struct starpu_codelet} *@var{cl})
 @deftypefun void starpu_codelet_display_stats ({struct starpu_codelet} *@var{cl})
@@ -3271,6 +3419,48 @@ successfull. It returns 0 if the synchronous copy was successful, or
 fails otherwise.
 fails otherwise.
 @end deftypefun
 @end deftypefun
 
 
+@node MIC extensions
+@section MIC extensions
+
+@defmac STARPU_USE_MIC
+This macro is defined when StarPU has been installed with MIC
+support. It should be used in your code to detect the availability of
+MIC.
+@end defmac
+
+@deftypefun int starpu_mic_register_kernel({starpu_mic_func_symbol_t *}@var{symbol}, {const char *}@var{func_name})
+Initiate a lookup on each MIC device to find the address of the function
+named FUNC_NAME, store them in the global array kernels and return
+the index in the array through SYMBOL.
+@end deftypefun
+
+@deftypefun starpu_mic_kernel_t starpu_mic_get_kernel(starpu_mic_func_symbol_t @var{symbol})
+On success, return the pointer to the function defined by SYMBOL on the
+device linked to the called device. This can for instance be used in a
+@code{starpu_mic_func_t} implementation.
+@end deftypefun
+
+@node SCC extensions
+@section SCC extensions
+
+@defmac STARPU_USE_SCC
+This macro is defined when StarPU has been installed with SCC
+support. It should be used in your code to detect the availability of
+SCC.
+@end defmac
+
+@deftypefun int starpu_scc_register_kernel({starpu_scc_func_symbol_t *}@var{symbol}, {const char *}@var{func_name})
+Initiate a lookup on each SCC device to find the address of the function
+named FUNC_NAME, store them in the global array kernels and return
+the index in the array through SYMBOL.
+@end deftypefun
+
+@deftypefun starpu_scc_kernel_t starpu_scc_get_kernel(starpu_scc_func_symbol_t @var{symbol})
+On success, return the pointer to the function defined by SYMBOL on the
+device linked to the called device. This can for instance be used in a
+@code{starpu_scc_func_t} implementation.
+@end deftypefun
+
 @node Miscellaneous helpers
 @node Miscellaneous helpers
 @section Miscellaneous helpers
 @section Miscellaneous helpers
 
 
@@ -3824,7 +4014,11 @@ number of CUDA workers.
 
 
 @item @code{unsigned nhwopenclgpus}
 @item @code{unsigned nhwopenclgpus}
 Total number of OpenCL devices, as detected. May be different from the actual
 Total number of OpenCL devices, as detected. May be different from the actual
-number of CUDA workers.
+number of OpenCL workers.
+
+@item @code{unsigned nhscc}
+Total number of SCC cores, as detected. May be different from the actual
+number of SCC workers.
 
 
 @item @code{unsigned ncpus}
 @item @code{unsigned ncpus}
 Actual number of CPU workers used by StarPU.
 Actual number of CPU workers used by StarPU.
@@ -3835,6 +4029,9 @@ Actual number of CUDA workers used by StarPU.
 @item @code{unsigned nopenclgpus}
 @item @code{unsigned nopenclgpus}
 Actual number of OpenCL workers used by StarPU.
 Actual number of OpenCL workers used by StarPU.
 
 
+@item @code{unsigned nsccdevices}
+Actual number of SCC workers used by StarPU.
+
 @item @code{unsigned workers_bindid[STARPU_NMAXWORKERS]}
 @item @code{unsigned workers_bindid[STARPU_NMAXWORKERS]}
 Indicates the successive cpu identifier that should be used to bind the
 Indicates the successive cpu identifier that should be used to bind the
 workers. It is either filled according to the user's explicit
 workers. It is either filled according to the user's explicit
@@ -3843,17 +4040,29 @@ variable. Otherwise, a round-robin policy is used to distributed the workers
 over the cpus.
 over the cpus.
 
 
 @item @code{unsigned workers_cuda_gpuid[STARPU_NMAXWORKERS]}
 @item @code{unsigned workers_cuda_gpuid[STARPU_NMAXWORKERS]}
-Indicates the successive cpu identifier that should be used by the CUDA
+Indicates the successive CUDA identifier that should be used by the CUDA
 driver.  It is either filled according to the user's explicit parameters (from
 driver.  It is either filled according to the user's explicit parameters (from
 starpu_conf) or according to the STARPU_WORKERS_CUDAID env. variable. Otherwise,
 starpu_conf) or according to the STARPU_WORKERS_CUDAID env. variable. Otherwise,
 they are taken in ID order.
 they are taken in ID order.
 
 
 @item @code{unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS]}
 @item @code{unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS]}
-Indicates the successive cpu identifier that should be used by the OpenCL
+Indicates the successive OpenCL identifier that should be used by the OpenCL
 driver.  It is either filled according to the user's explicit parameters (from
 driver.  It is either filled according to the user's explicit parameters (from
 starpu_conf) or according to the STARPU_WORKERS_OPENCLID env. variable. Otherwise,
 starpu_conf) or according to the STARPU_WORKERS_OPENCLID env. variable. Otherwise,
 they are taken in ID order.
 they are taken in ID order.
 
 
+@item @code{unsigned workers_mic_deviceid[STARPU_NMAXWORKERS]}
+Indicates the successive MIC devices that should be used by the MIC
+driver.  It is either filled according to the user's explicit parameters (from
+starpu_conf) or according to the STARPU_WORKERS_MICID env. variable. Otherwise,
+they are taken in ID order.
+
+@item @code{unsigned workers_scc_deviceid[STARPU_NMAXWORKERS]}
+Indicates the successive SCC devices that should be used by the SCC
+driver.  It is either filled according to the user's explicit parameters (from
+starpu_conf) or according to the STARPU_WORKERS_SCCID env. variable. Otherwise,
+they are taken in ID order.
+
 @end table
 @end table
 @end deftp
 @end deftp
 
 
@@ -3892,7 +4101,7 @@ The workerids managed by the collection
 The number of workerids
 The number of workerids
 @item @code{pthread_key_t cursor_key} (optional)
 @item @code{pthread_key_t cursor_key} (optional)
 The cursor needed to iterate the collection (depending on the data structure)
 The cursor needed to iterate the collection (depending on the data structure)
-@item @code{int type}
+@item @code{enum starpu_worker_collection_type type}
 The type of structure (currently STARPU_WORKER_LIST is the only one available)
 The type of structure (currently STARPU_WORKER_LIST is the only one available)
 @item @code{unsigned (*has_next)(struct starpu_worker_collection *workers)}
 @item @code{unsigned (*has_next)(struct starpu_worker_collection *workers)}
 Checks if there is a next worker
 Checks if there is a next worker
@@ -3914,7 +4123,7 @@ Deinitialize the cursor if there is one
 @end table
 @end table
 @end deftp
 @end deftp
 
 
-@deftypefun struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection (unsigned @var{sched_ctx_id}, int @var{type})
+@deftypefun struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection (unsigned @var{sched_ctx_id}, enum starpu_worker_collection_type @var{type})
 Create a worker collection of the type indicated by the last parameter for the context specified through the first parameter.
 Create a worker collection of the type indicated by the last parameter for the context specified through the first parameter.
 @end deftypefun
 @end deftypefun
 
 

+ 3 - 0
doc/chapters/basic-examples.texi

@@ -132,6 +132,7 @@ struct starpu_codelet cl =
 @{
 @{
     .where = STARPU_CPU,
     .where = STARPU_CPU,
     .cpu_funcs = @{ cpu_func, NULL @},
     .cpu_funcs = @{ cpu_func, NULL @},
+    .cpu_funcs_name = @{ "cpu_func", NULL @},
     .nbuffers = 0
     .nbuffers = 0
 @};
 @};
 @end smallexample
 @end smallexample
@@ -642,6 +643,7 @@ void scal_cpu_func(void *buffers[], void *cl_arg)
 struct starpu_codelet cl =
 struct starpu_codelet cl =
 @{
 @{
     .cpu_funcs = @{ scal_cpu_func, NULL @},
     .cpu_funcs = @{ scal_cpu_func, NULL @},
+    .cpu_funcs_name = @{ "scal_cpu_func", NULL @},
     .nbuffers = 1,
     .nbuffers = 1,
     .modes = @{ STARPU_RW @}
     .modes = @{ STARPU_RW @}
 @};
 @};
@@ -822,6 +824,7 @@ static struct starpu_codelet cl =
 @{
 @{
     .cuda_funcs = @{ scal_cuda_func, NULL @},
     .cuda_funcs = @{ scal_cuda_func, NULL @},
     .cpu_funcs = @{ scal_cpu_func, NULL @},
     .cpu_funcs = @{ scal_cpu_func, NULL @},
+    .cpu_funcs_name = @{ "scal_cpu_func", NULL @},
     .opencl_funcs = @{ scal_opencl_func, NULL @},
     .opencl_funcs = @{ scal_opencl_func, NULL @},
     .nbuffers = 1,
     .nbuffers = 1,
     .modes = @{ STARPU_RW @}
     .modes = @{ STARPU_RW @}

+ 4 - 0
doc/chapters/configuration.texi

@@ -473,6 +473,10 @@ fail when copying data asynchronously. When using this implementation,
 it is therefore necessary to disable asynchronous data transfers.
 it is therefore necessary to disable asynchronous data transfers.
 @end defvr
 @end defvr
 
 
+@defvr {Environment variable} STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY
+Disable asynchronous copies between CPU and MIC devices.
+@end defvr
+
 @defvr {Environment variable} STARPU_DISABLE_CUDA_GPU_GPU_DIRECT
 @defvr {Environment variable} STARPU_DISABLE_CUDA_GPU_GPU_DIRECT
 Disable direct CUDA transfers from GPU to GPU, and let CUDA copy through RAM
 Disable direct CUDA transfers from GPU to GPU, and let CUDA copy through RAM
 instead. This permits to test the performance effect of GPU-Direct.
 instead. This permits to test the performance effect of GPU-Direct.

+ 55 - 0
doc/chapters/mic-scc-support.texi

@@ -0,0 +1,55 @@
+@c -*-texinfo-*-
+
+@c This file is part of the StarPU Handbook.
+@c Copyright (C) 2013  Universit@'e de Bordeaux 1
+@c See the file starpu.texi for copying conditions.
+
+@section Compilation
+
+SCC support just needs the presence of the RCCE library.
+
+MIC support actually needs two compilations of StarPU, one for the host and one for
+the device. The @code{mic-configure} script can be used to achieve this: it basically
+calls @code{configure} as appropriate from two new directories: @code{build_mic} and
+@code{build_host}. @code{make} and @code{make install} can then be used as usual and will
+recurse into both directories.
+
+@c TODO: move to configuration section ?
+
+It can be parameterized with the following environment variables:
+
+@table @asis
+@item @code{MIC_HOST}
+Defines the value of the @code{--host} parameter passed to @code{configure} for the
+cross-compilation. The current default is @code{x86_64-k1om-linux}.
+
+@item @code{MIC_CC_PATH}
+Defines the path to the MIC cross-compiler. The current default is @code{/usr/linux-k1om-4.7/bin/}.
+
+@item @code{COI_DIR}
+Defines the path to the COI library. The current default is @code{/opt/intel/mic/coi}.
+@end table
+
+@section Porting applications to MIC/SCC
+
+The simplest way to port an application to MIC/SCC is to add the
+@code{cpu_funcs_name} field in the codelet, to provide StarPU with the function
+name of the CPU implementation. StarPU will thus simply use the existing CPU
+implementation (cross-rebuilt in the MIC case). The functions have to be
+globally-visible (i.e. not @code{static}) for StarPU to be able to look them up.
+
+For SCC execution, @code{starpu_initialize} also has to be used instead of @code{starpu_init}, so
+as to pass @code{argc} and @code{argv}.
+
+@section Launching programs
+
+SCC programs are started through RCCE.
+
+MIC programs are started from the host. StarPU automatically
+starts the same program on MIC devices. It however needs to get
+the MIC-cross-built binary. It will look for the file given by the
+@code{STARPU_MIC_SINK_PROGRAM_NAME} environment variable or in the directory
+given by the @code{STARPU_MIC_SINK_PROGRAM_PATH} environment variable, or in
+the @code{mic_sink_program_path} field of the @code{starpu_conf} structure.
+It will also look in the current directory for the same binary name plus a
+@code{-mic} or @code{_mic} suffix.

+ 4 - 1
doc/chapters/perf-optimization.texi

@@ -79,7 +79,9 @@ dependencies on that data.
 
 
 In the same vein, accumulation of results in the same data can become a
 In the same vein, accumulation of results in the same data can become a
 bottleneck. The use of the @code{STARPU_REDUX} mode permits to optimize such
 bottleneck. The use of the @code{STARPU_REDUX} mode permits to optimize such
-accumulation (@pxref{Data reduction}).
+accumulation (@pxref{Data reduction}). To a lesser extent, the use of the
+@code{STARPU_COMMUTE} flag keeps the bottleneck, but at least permits the
+accumulation to happen in any order.
 
 
 Applications often need a data just for temporary results.  In such a case,
 Applications often need a data just for temporary results.  In such a case,
 registration can be made without an initial value, for instance this produces a vector data:
 registration can be made without an initial value, for instance this produces a vector data:
@@ -554,6 +556,7 @@ CUDA or OpenCL execution:
 static struct starpu_codelet cl11 =
 static struct starpu_codelet cl11 =
 @{
 @{
 	.cpu_funcs = @{chol_cpu_codelet_update_u11, NULL@},
 	.cpu_funcs = @{chol_cpu_codelet_update_u11, NULL@},
+	.cpu_funcs_name = @{"chol_cpu_codelet_update_u11", NULL@},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = @{chol_cublas_codelet_update_u11, NULL@},
 	.cuda_funcs = @{chol_cublas_codelet_update_u11, NULL@},
 #elif defined(STARPU_SIMGRID)
 #elif defined(STARPU_SIMGRID)

+ 3 - 1
doc/chapters/sc_hypervisor.texi

@@ -106,7 +106,9 @@ parallel kernels and the number of instruction to be executed by each task.
 The number of flops to be executed by a context are passed as parameter when they are registered to the hypervisor,
 The number of flops to be executed by a context are passed as parameter when they are registered to the hypervisor,
  (@code{sc_hypervisor_register_ctx(sched_ctx_id, flops)}) and the one to be executed by each task are passed when the task is submitted.
  (@code{sc_hypervisor_register_ctx(sched_ctx_id, flops)}) and the one to be executed by each task are passed when the task is submitted.
 The corresponding field in the @code{starpu_task} data structure is @code{flops} and
 The corresponding field in the @code{starpu_task} data structure is @code{flops} and
-the corresponding macro in @code{starpu_insert_task} function is @code{STARPU_FLOPS}. When the task is executed
+the corresponding macro in @code{starpu_insert_task} function is
+@code{STARPU_FLOPS} (but take care to pass a double, not an integer, otherwise
+parameter passing will be bogus). When the task is executed
 the resizing process is triggered.
 the resizing process is triggered.
 @cartouche
 @cartouche
 @smallexample
 @smallexample

+ 2 - 1
doc/chapters/vector_scal_c.texi

@@ -1,7 +1,7 @@
 @c -*-texinfo-*-
 @c -*-texinfo-*-
 
 
 @c This file is part of the StarPU Handbook.
 @c This file is part of the StarPU Handbook.
-@c Copyright (C) 2009-2011  Université de Bordeaux 1
+@c Copyright (C) 2009-2011, 2013  Université de Bordeaux 1
 @c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 @c Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
 @c See the file starpu.texi for copying conditions.
 @c See the file starpu.texi for copying conditions.
 
 
@@ -26,6 +26,7 @@ static struct starpu_codelet cl = @{
     .where = STARPU_CPU | STARPU_CUDA | STARPU_OPENCL,
     .where = STARPU_CPU | STARPU_CUDA | STARPU_OPENCL,
     /* CPU implementation of the codelet */
     /* CPU implementation of the codelet */
     .cpu_funcs = @{ scal_cpu_func, scal_sse_func, NULL @},
     .cpu_funcs = @{ scal_cpu_func, scal_sse_func, NULL @},
+    .cpu_funcs_name = @{ "scal_cpu_func", "scal_sse_func", NULL @},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
     /* CUDA implementation of the codelet */
     /* CUDA implementation of the codelet */
     .cuda_funcs = @{ scal_cuda_func, NULL @},
     .cuda_funcs = @{ scal_cuda_func, NULL @},

+ 9 - 0
doc/starpu.texi

@@ -74,6 +74,7 @@ was last updated on @value{UPDATED}.
 * Tips and Tricks::             Tips and tricks to know about
 * Tips and Tricks::             Tips and tricks to know about
 * StarPU MPI support::          How to combine StarPU with MPI
 * StarPU MPI support::          How to combine StarPU with MPI
 * StarPU FFT support::          How to perform FFT computations with StarPU
 * StarPU FFT support::          How to perform FFT computations with StarPU
+* StarPU MIC/SCC support::      How to build and run StarPU applications on MIC and SCC
 * C Extensions::                Easier StarPU programming with GCC
 * C Extensions::                Easier StarPU programming with GCC
 * SOCL OpenCL Extensions::      How to use OpenCL on top of StarPU
 * SOCL OpenCL Extensions::      How to use OpenCL on top of StarPU
 * Scheduling Contexts in StarPU::         How to use Scheduling Context of StarPU
 * Scheduling Contexts in StarPU::         How to use Scheduling Context of StarPU
@@ -163,6 +164,14 @@ was last updated on @value{UPDATED}.
 @include chapters/fft-support.texi
 @include chapters/fft-support.texi
 
 
 @c ---------------------------------------------------------------------
 @c ---------------------------------------------------------------------
+@c MIC/SCC support
+@c ---------------------------------------------------------------------
+
+@node StarPU MIC/SCC support
+@chapter StarPU MIC/SCC support
+@include chapters/mic-scc-support.texi
+
+@c ---------------------------------------------------------------------
 @c C Extensions
 @c C Extensions
 @c ---------------------------------------------------------------------
 @c ---------------------------------------------------------------------
 
 

+ 9 - 1
examples/Makefile.am

@@ -20,7 +20,7 @@ AM_CFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STAR
 AM_CXXFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
 AM_CXXFLAGS = $(MAGMA_CFLAGS) $(HWLOC_CFLAGS) -Wall $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS)
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(MAGMA_LIBS) $(HWLOC_LIBS) @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) 
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS)
 
 
 SUBDIRS = stencil
 SUBDIRS = stencil
 
 
@@ -138,10 +138,15 @@ endif
 
 
 if !STARPU_HAVE_WINDOWS
 if !STARPU_HAVE_WINDOWS
 ## test loader program
 ## test loader program
+if !STARPU_CROSS_COMPILING
 LOADER			=	loader
 LOADER			=	loader
 loader_CPPFLAGS =  $(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
 loader_CPPFLAGS =  $(AM_CFLAGS) $(AM_CPPFLAGS) -I$(top_builddir)/src/
 LOADER_BIN		=	$(abs_top_builddir)/examples/$(LOADER)
 LOADER_BIN		=	$(abs_top_builddir)/examples/$(LOADER)
 loader_SOURCES		=	../tests/loader.c
 loader_SOURCES		=	../tests/loader.c
+else
+LOADER			=
+LOADER_BIN		=	$(top_builddir)/tests/loader-cross.sh
+endif
 
 
 if STARPU_HAVE_AM111
 if STARPU_HAVE_AM111
 TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
 TESTS_ENVIRONMENT	=	top_builddir="$(abs_top_builddir)" top_srcdir="$(abs_top_srcdir)"
@@ -305,9 +310,12 @@ basic_examples_vector_scal_SOURCES =		\
 	basic_examples/vector_scal_cpu.c
 	basic_examples/vector_scal_cpu.c
 
 
 if STARPU_HAVE_ICC
 if STARPU_HAVE_ICC
+if STARPU_CROSS_COMPILING
+else
 basic_examples_vector_scal_SOURCES +=		\
 basic_examples_vector_scal_SOURCES +=		\
 	basic_examples/vector_scal_cpu_icc.icc
 	basic_examples/vector_scal_cpu_icc.icc
 endif
 endif
+endif
 
 
 if STARPU_USE_CUDA
 if STARPU_USE_CUDA
 basic_examples_vector_scal_SOURCES +=		\
 basic_examples_vector_scal_SOURCES +=		\

+ 3 - 2
examples/basic_examples/mult.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
@@ -77,7 +77,7 @@ static unsigned zdim = 512;
  * registered data with the "matrix" data interface, we use the matrix macros.
  * registered data with the "matrix" data interface, we use the matrix macros.
  */
  */
 
 
-static void cpu_mult(void *descr[], __attribute__((unused))  void *arg)
+void cpu_mult(void *descr[], __attribute__((unused))  void *arg)
 {
 {
 	float *subA, *subB, *subC;
 	float *subA, *subB, *subC;
 	uint32_t nxC, nyC, nyA;
 	uint32_t nxC, nyC, nyA;
@@ -264,6 +264,7 @@ static struct starpu_codelet cl =
         /* we can only execute that kernel on a CPU yet */
         /* we can only execute that kernel on a CPU yet */
         /* CPU implementation of the codelet */
         /* CPU implementation of the codelet */
         .cpu_funcs = {cpu_mult, NULL},
         .cpu_funcs = {cpu_mult, NULL},
+        .cpu_funcs_name = {"cpu_mult", NULL},
         /* the codelet manipulates 3 buffers that are managed by the
         /* the codelet manipulates 3 buffers that are managed by the
          * DSM */
          * DSM */
         .nbuffers = 3,
         .nbuffers = 3,

+ 17 - 4
examples/basic_examples/vector_scal.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -52,21 +52,34 @@ static struct starpu_perfmodel vector_scal_power_model =
 
 
 static struct starpu_codelet cl =
 static struct starpu_codelet cl =
 {
 {
-	.where = STARPU_CPU | STARPU_CUDA | STARPU_OPENCL,
+	.where = STARPU_CPU | STARPU_CUDA | STARPU_OPENCL | STARPU_MIC,
 	/* CPU implementation of the codelet */
 	/* CPU implementation of the codelet */
 	.cpu_funcs = {
 	.cpu_funcs = {
 		scal_cpu_func
 		scal_cpu_func
-#ifdef STARPU_HAVE_ICC
+#if defined(STARPU_HAVE_ICC) && !defined(__KNC__) && !defined(__KNF__)
 		, scal_cpu_func_icc
 		, scal_cpu_func_icc
 #endif
 #endif
 #ifdef __SSE__
 #ifdef __SSE__
 		, scal_sse_func
 		, scal_sse_func
-#ifdef STARPU_HAVE_ICC
+#if defined(STARPU_HAVE_ICC) && !defined(__KNC__) && !defined(__KNF__)
 		, scal_sse_func_icc
 		, scal_sse_func_icc
 #endif
 #endif
 #endif
 #endif
 		, NULL
 		, NULL
 	},
 	},
+	.cpu_funcs_name = {
+		"scal_cpu_func",
+#if defined(STARPU_HAVE_ICC) && !defined(__KNC__) && !defined(__KNF__)
+		"scal_cpu_func_icc",
+#endif
+#ifdef __SSE__
+		"scal_sse_func",
+#if defined(STARPU_HAVE_ICC) && !defined(__KNC__) && !defined(__KNF__)
+		"scal_sse_func_icc"
+#endif
+#endif
+	},
+
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	/* CUDA implementation of the codelet */
 	/* CUDA implementation of the codelet */
 	.cuda_funcs = {scal_cuda_func, NULL},
 	.cuda_funcs = {scal_cuda_func, NULL},

+ 2 - 1
examples/basic_examples/vector_scal_c.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012  Centre National de la Recherche Scientifique
- * Copyright (C) 2011  Université de Bordeaux 1
+ * Copyright (C) 2011, 2013  Université de Bordeaux 1
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -43,6 +43,7 @@ static struct starpu_codelet cl =
 	.modes = { STARPU_RW },
 	.modes = { STARPU_RW },
 	/* CPU implementation of the codelet */
 	/* CPU implementation of the codelet */
 	.cpu_funcs = {scal_cpu_func, NULL},
 	.cpu_funcs = {scal_cpu_func, NULL},
+	.cpu_funcs_name = {"scal_cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	/* CUDA implementation of the codelet */
 	/* CUDA implementation of the codelet */
 	.cuda_funcs = {scal_cuda_func, NULL},
 	.cuda_funcs = {scal_cuda_func, NULL},

+ 2 - 0
examples/filters/fblock.c

@@ -92,6 +92,7 @@ int main(int argc, char **argv)
 	struct starpu_codelet cl =
 	struct starpu_codelet cl =
 	{
 	{
                 .cpu_funcs = {cpu_func, NULL},
                 .cpu_funcs = {cpu_func, NULL},
+                .cpu_funcs_name = {"cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
                 .cuda_funcs = {cuda_func, NULL},
                 .cuda_funcs = {cuda_func, NULL},
 #endif
 #endif
@@ -147,6 +148,7 @@ int main(int argc, char **argv)
                 task->callback_func = NULL;
                 task->callback_func = NULL;
                 task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
                 task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
                 task->cl_arg = &multiplier;
                 task->cl_arg = &multiplier;
+                task->cl_arg_size = sizeof(multiplier);
 
 
                 ret = starpu_task_submit(task);
                 ret = starpu_task_submit(task);
                 if (ret)
                 if (ret)

+ 1 - 0
examples/filters/fmatrix.c

@@ -63,6 +63,7 @@ int main(int argc, char **argv)
         struct starpu_codelet cl =
         struct starpu_codelet cl =
 	{
 	{
                 .cpu_funcs = {cpu_func, NULL},
                 .cpu_funcs = {cpu_func, NULL},
+                .cpu_funcs_name = {"cpu_func", NULL},
                 .nbuffers = 1,
                 .nbuffers = 1,
 		.modes = {STARPU_RW},
 		.modes = {STARPU_RW},
 		.name = "matrix_scal"
 		.name = "matrix_scal"

+ 1 - 0
examples/filters/fvector.c

@@ -46,6 +46,7 @@ int main(int argc, char **argv)
         struct starpu_codelet cl =
         struct starpu_codelet cl =
 	{
 	{
                 .cpu_funcs = {cpu_func, NULL},
                 .cpu_funcs = {cpu_func, NULL},
+                .cpu_funcs_name = {"cpu_func", NULL},
                 .nbuffers = 1,
                 .nbuffers = 1,
 		.modes = {STARPU_RW},
 		.modes = {STARPU_RW},
 		.name = "vector_scal"
 		.name = "vector_scal"

+ 2 - 1
examples/filters/shadow.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2012  Université de Bordeaux 1
+ * Copyright (C) 2012-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -100,6 +100,7 @@ int main(int argc, char **argv)
         struct starpu_codelet cl =
         struct starpu_codelet cl =
 	{
 	{
                 .cpu_funcs = {cpu_func, NULL},
                 .cpu_funcs = {cpu_func, NULL},
+                .cpu_funcs_name = {"cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
                 .cuda_funcs = {cuda_func, NULL},
                 .cuda_funcs = {cuda_func, NULL},
 #endif
 #endif

+ 2 - 1
examples/filters/shadow2d.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2012  Université de Bordeaux 1
+ * Copyright (C) 2012-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -154,6 +154,7 @@ int main(int argc, char **argv)
         struct starpu_codelet cl =
         struct starpu_codelet cl =
 	{
 	{
                 .cpu_funcs = {cpu_func, NULL},
                 .cpu_funcs = {cpu_func, NULL},
+                .cpu_funcs_name = {"cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
                 .cuda_funcs = {cuda_func, NULL},
                 .cuda_funcs = {cuda_func, NULL},
 #endif
 #endif

+ 2 - 1
examples/filters/shadow3d.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2012  Université de Bordeaux 1
+ * Copyright (C) 2012-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -121,6 +121,7 @@ int main(int argc, char **argv)
         struct starpu_codelet cl =
         struct starpu_codelet cl =
 	{
 	{
                 .cpu_funcs = {cpu_func, NULL},
                 .cpu_funcs = {cpu_func, NULL},
+                .cpu_funcs_name = {"cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
                 .cuda_funcs = {cuda_func, NULL},
                 .cuda_funcs = {cuda_func, NULL},
 #endif
 #endif

+ 1 - 0
examples/incrementer/incrementer.c

@@ -66,6 +66,7 @@ int main(int argc, char **argv)
 	struct starpu_codelet cl =
 	struct starpu_codelet cl =
 	{
 	{
 		.cpu_funcs = {cpu_codelet, NULL},
 		.cpu_funcs = {cpu_codelet, NULL},
+		.cpu_funcs_name = {"cpu_codelet", NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 		.cuda_funcs = {cuda_codelet, NULL},
 		.cuda_funcs = {cuda_codelet, NULL},
 #endif
 #endif

+ 2 - 0
examples/interface/complex_codelet.h

@@ -51,6 +51,7 @@ void compare_complex_codelet(void *descr[], void *_args)
 struct starpu_codelet cl_compare =
 struct starpu_codelet cl_compare =
 {
 {
 	.cpu_funcs = {compare_complex_codelet, NULL},
 	.cpu_funcs = {compare_complex_codelet, NULL},
+	.cpu_funcs_name = {"compare_complex_codelet", NULL},
 	.nbuffers = 2,
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_R},
 	.modes = {STARPU_R, STARPU_R},
 	.name = "cl_compare"
 	.name = "cl_compare"
@@ -76,6 +77,7 @@ void display_complex_codelet(void *descr[], void *_args)
 struct starpu_codelet cl_display =
 struct starpu_codelet cl_display =
 {
 {
 	.cpu_funcs = {display_complex_codelet, NULL},
 	.cpu_funcs = {display_complex_codelet, NULL},
+	.cpu_funcs_name = {"display_complex_codelet", NULL},
 	.nbuffers = 1,
 	.nbuffers = 1,
 	.modes = {STARPU_R},
 	.modes = {STARPU_R},
 	.name = "cl_display"
 	.name = "cl_display"

+ 2 - 1
examples/openmp/vector_scal_omp.c

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -65,6 +65,7 @@ static struct starpu_codelet cl =
 	.type = STARPU_FORKJOIN,
 	.type = STARPU_FORKJOIN,
 	.max_parallelism = INT_MAX,
 	.max_parallelism = INT_MAX,
 	.cpu_funcs = {scal_cpu_func, NULL},
 	.cpu_funcs = {scal_cpu_func, NULL},
+	.cpu_funcs_name = {"scal_cpu_func", NULL},
 	.nbuffers = 1,
 	.nbuffers = 1,
 	.model = &vector_scal_model,
 	.model = &vector_scal_model,
 };
 };

+ 2 - 1
examples/pi/pi.c

@@ -32,7 +32,7 @@ static unsigned ntasks = 1024;
 
 
 static unsigned long long nshot_per_task = 16*1024*1024ULL;
 static unsigned long long nshot_per_task = 16*1024*1024ULL;
 
 
-static void cpu_kernel(void *descr[], void *cl_arg)
+void cpu_kernel(void *descr[], void *cl_arg)
 {
 {
 	unsigned *directions = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned *directions = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 	unsigned nx = nshot_per_task;
 	unsigned nx = nshot_per_task;
@@ -107,6 +107,7 @@ static struct starpu_perfmodel model =
 static struct starpu_codelet pi_cl =
 static struct starpu_codelet pi_cl =
 {
 {
 	.cpu_funcs = {cpu_kernel, NULL},
 	.cpu_funcs = {cpu_kernel, NULL},
+	.cpu_funcs_name = {"cpu_kernel", NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {cuda_kernel, NULL},
 	.cuda_funcs = {cuda_kernel, NULL},
 #endif
 #endif

+ 7 - 3
examples/pi/pi_redux.c

@@ -138,7 +138,7 @@ static void parse_args(int argc, char **argv)
  *	Monte-carlo kernel
  *	Monte-carlo kernel
  */
  */
 
 
-static void pi_func_cpu(void *descr[], void *cl_arg __attribute__ ((unused)))
+void pi_func_cpu(void *descr[], void *cl_arg __attribute__ ((unused)))
 {
 {
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
 
 
@@ -209,6 +209,7 @@ static struct starpu_perfmodel pi_model =
 static struct starpu_codelet pi_cl =
 static struct starpu_codelet pi_cl =
 {
 {
 	.cpu_funcs = {pi_func_cpu, NULL},
 	.cpu_funcs = {pi_func_cpu, NULL},
+	.cpu_funcs_name = {"pi_func_cpu", NULL},
 #ifdef STARPU_HAVE_CURAND
 #ifdef STARPU_HAVE_CURAND
 	.cuda_funcs = {pi_func_cuda, NULL},
 	.cuda_funcs = {pi_func_cuda, NULL},
 #endif
 #endif
@@ -227,6 +228,7 @@ static struct starpu_perfmodel pi_model_redux =
 static struct starpu_codelet pi_cl_redux =
 static struct starpu_codelet pi_cl_redux =
 {
 {
 	.cpu_funcs = {pi_func_cpu, NULL},
 	.cpu_funcs = {pi_func_cpu, NULL},
+	.cpu_funcs_name = {"pi_func_cpu", NULL},
 #ifdef STARPU_HAVE_CURAND
 #ifdef STARPU_HAVE_CURAND
 	.cuda_funcs = {pi_func_cuda, NULL},
 	.cuda_funcs = {pi_func_cuda, NULL},
 #endif
 #endif
@@ -239,7 +241,7 @@ static struct starpu_codelet pi_cl_redux =
  *	Codelets to implement reduction
  *	Codelets to implement reduction
  */
  */
 
 
-static void init_cpu_func(void *descr[], void *cl_arg)
+void init_cpu_func(void *descr[], void *cl_arg)
 {
 {
         unsigned long *val = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
         unsigned long *val = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
         *val = 0;
         *val = 0;
@@ -257,6 +259,7 @@ static void init_cuda_func(void *descr[], void *cl_arg)
 static struct starpu_codelet init_codelet =
 static struct starpu_codelet init_codelet =
 {
 {
         .cpu_funcs = {init_cpu_func, NULL},
         .cpu_funcs = {init_cpu_func, NULL},
+        .cpu_funcs_name = {"init_cpu_func", NULL},
 #ifdef STARPU_HAVE_CURAND
 #ifdef STARPU_HAVE_CURAND
         .cuda_funcs = {init_cuda_func, NULL},
         .cuda_funcs = {init_cuda_func, NULL},
 #endif
 #endif
@@ -284,7 +287,7 @@ static void redux_cuda_func(void *descr[], void *cl_arg)
 }
 }
 #endif
 #endif
 
 
-static void redux_cpu_func(void *descr[], void *cl_arg)
+void redux_cpu_func(void *descr[], void *cl_arg)
 {
 {
 	unsigned long *a = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned long *a = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	unsigned long *b = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
 	unsigned long *b = (unsigned long *)STARPU_VARIABLE_GET_PTR(descr[1]);
@@ -295,6 +298,7 @@ static void redux_cpu_func(void *descr[], void *cl_arg)
 static struct starpu_codelet redux_codelet =
 static struct starpu_codelet redux_codelet =
 {
 {
 	.cpu_funcs = {redux_cpu_func, NULL},
 	.cpu_funcs = {redux_cpu_func, NULL},
+	.cpu_funcs_name = {"redux_cpu_func", NULL},
 #ifdef STARPU_HAVE_CURAND
 #ifdef STARPU_HAVE_CURAND
 	.cuda_funcs = {redux_cuda_func, NULL},
 	.cuda_funcs = {redux_cuda_func, NULL},
 #endif
 #endif

+ 3 - 2
examples/ppm_downscaler/yuv_downscaler.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2011  Université de Bordeaux 1
+ * Copyright (C) 2010-2011, 2013  Université de Bordeaux 1
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010  Mehdi Juhoor <mjuhoor@gmail.com>
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
@@ -52,7 +52,7 @@ void parse_args(int argc, char **argv)
 #define FRAMESIZE	sizeof(struct yuv_frame)
 #define FRAMESIZE	sizeof(struct yuv_frame)
 #define NEW_FRAMESIZE	sizeof(struct yuv_new_frame)
 #define NEW_FRAMESIZE	sizeof(struct yuv_new_frame)
 
 
-static void ds_kernel_cpu(void *descr[], __attribute__((unused)) void *arg)
+void ds_kernel_cpu(void *descr[], __attribute__((unused)) void *arg)
 {
 {
 	uint8_t *input = (uint8_t *)STARPU_MATRIX_GET_PTR(descr[0]);
 	uint8_t *input = (uint8_t *)STARPU_MATRIX_GET_PTR(descr[0]);
 	const unsigned input_ld = STARPU_MATRIX_GET_LD(descr[0]);
 	const unsigned input_ld = STARPU_MATRIX_GET_LD(descr[0]);
@@ -86,6 +86,7 @@ static void ds_kernel_cpu(void *descr[], __attribute__((unused)) void *arg)
 static struct starpu_codelet ds_codelet =
 static struct starpu_codelet ds_codelet =
 {
 {
 	.cpu_funcs = {ds_kernel_cpu, NULL},
 	.cpu_funcs = {ds_kernel_cpu, NULL},
+	.cpu_funcs_name = {"ds_kernel_cpu", NULL},
 	.nbuffers = 2, /* input -> output */
 	.nbuffers = 2, /* input -> output */
 	.modes = {STARPU_R, STARPU_W},
 	.modes = {STARPU_R, STARPU_W},
 	.model = NULL
 	.model = NULL

+ 3 - 0
examples/reductions/dot_product.c

@@ -108,6 +108,7 @@ static struct starpu_codelet init_codelet =
 {
 {
 	.can_execute = can_execute,
 	.can_execute = can_execute,
 	.cpu_funcs = {init_cpu_func, NULL},
 	.cpu_funcs = {init_cpu_func, NULL},
+	.cpu_funcs_name = {"init_cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {init_cuda_func, NULL},
 	.cuda_funcs = {init_cuda_func, NULL},
 #endif
 #endif
@@ -190,6 +191,7 @@ static struct starpu_codelet redux_codelet =
 {
 {
 	.can_execute = can_execute,
 	.can_execute = can_execute,
 	.cpu_funcs = {redux_cpu_func, NULL},
 	.cpu_funcs = {redux_cpu_func, NULL},
+	.cpu_funcs_name = {"redux_cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {redux_cuda_func, NULL},
 	.cuda_funcs = {redux_cuda_func, NULL},
 #endif
 #endif
@@ -308,6 +310,7 @@ static struct starpu_codelet dot_codelet =
 {
 {
 	.can_execute = can_execute,
 	.can_execute = can_execute,
 	.cpu_funcs = {dot_cpu_func, NULL},
 	.cpu_funcs = {dot_cpu_func, NULL},
+	.cpu_funcs_name = {"dot_cpu_func", NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dot_cuda_func, NULL},
 	.cuda_funcs = {dot_cuda_func, NULL},
 #endif
 #endif

+ 4 - 1
examples/reductions/minmax_reduction.c

@@ -44,7 +44,7 @@ static starpu_data_handle_t _minmax_handle;
  *	Codelet to create a neutral element
  *	Codelet to create a neutral element
  */
  */
 
 
-static void minmax_neutral_cpu_func(void *descr[], void *cl_arg)
+void minmax_neutral_cpu_func(void *descr[], void *cl_arg)
 {
 {
 	TYPE *array = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	TYPE *array = (TYPE *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 
@@ -58,6 +58,7 @@ static void minmax_neutral_cpu_func(void *descr[], void *cl_arg)
 static struct starpu_codelet minmax_init_codelet =
 static struct starpu_codelet minmax_init_codelet =
 {
 {
 	.cpu_funcs = {minmax_neutral_cpu_func, NULL},
 	.cpu_funcs = {minmax_neutral_cpu_func, NULL},
+	.cpu_funcs_name = {"minmax_neutral_cpu_func", NULL},
 	.modes = {STARPU_W},
 	.modes = {STARPU_W},
 	.nbuffers = 1,
 	.nbuffers = 1,
 	.name = "init"
 	.name = "init"
@@ -86,6 +87,7 @@ void minmax_redux_cpu_func(void *descr[], void *cl_arg)
 static struct starpu_codelet minmax_redux_codelet =
 static struct starpu_codelet minmax_redux_codelet =
 {
 {
 	.cpu_funcs = {minmax_redux_cpu_func, NULL},
 	.cpu_funcs = {minmax_redux_cpu_func, NULL},
+	.cpu_funcs_name = {"minmax_redux_cpu_func", NULL},
 	.modes = {STARPU_RW, STARPU_R},
 	.modes = {STARPU_RW, STARPU_R},
 	.nbuffers = 2,
 	.nbuffers = 2,
 	.name = "redux"
 	.name = "redux"
@@ -122,6 +124,7 @@ void minmax_cpu_func(void *descr[], void *cl_arg)
 static struct starpu_codelet minmax_codelet =
 static struct starpu_codelet minmax_codelet =
 {
 {
 	.cpu_funcs = {minmax_cpu_func, NULL},
 	.cpu_funcs = {minmax_cpu_func, NULL},
+	.cpu_funcs_name = {"minmax_cpu_func", NULL},
 	.nbuffers = 2,
 	.nbuffers = 2,
 	.modes = {STARPU_R, STARPU_REDUX},
 	.modes = {STARPU_R, STARPU_REDUX},
 	.name = "minmax"
 	.name = "minmax"

+ 1 - 0
examples/spmd/vector_scal_spmd.c

@@ -84,6 +84,7 @@ static struct starpu_codelet cl =
 	.type = STARPU_SPMD,
 	.type = STARPU_SPMD,
 	.max_parallelism = INT_MAX,
 	.max_parallelism = INT_MAX,
 	.cpu_funcs = {scal_cpu_func, NULL},
 	.cpu_funcs = {scal_cpu_func, NULL},
+	.cpu_funcs_name = {"scal_cpu_func", NULL},
 	.nbuffers = 1,
 	.nbuffers = 1,
 	.model = &vector_scal_model,
 	.model = &vector_scal_model,
 };
 };

+ 1 - 1
examples/stencil/Makefile.am

@@ -16,7 +16,7 @@
 AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
 AM_CFLAGS = $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(GLOBAL_AM_CFLAGS) -Wno-unused
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(HWLOC_LIBS) @LIBS@
 LIBS = $(top_builddir)/src/@LIBSTARPU_LINK@ $(HWLOC_LIBS) @LIBS@
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
 AM_CPPFLAGS = -I$(top_srcdir)/include/ -I$(top_srcdir)/examples/ -I$(top_builddir)/include
-AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS)
+AM_LDFLAGS = $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS)
 
 
 if USE_MPI
 if USE_MPI
 LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la
 LIBS += $(top_builddir)/mpi/src/libstarpumpi-@STARPU_EFFECTIVE_VERSION@.la

+ 7 - 4
examples/stencil/stencil-kernels.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2012  Centre National de la Recherche Scientifique
  * Copyright (C) 2012  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -376,7 +376,7 @@ fprintf(stderr,"!!! DO update_func_opencl z %d OPENCL%d !!!\n", block->bz, worke
 /*
 /*
  * cl_update (CPU version)
  * cl_update (CPU version)
  */
  */
-static void update_func_cpu(void *descr[], void *arg)
+void update_func_cpu(void *descr[], void *arg)
 {
 {
 	struct block_description *block = (struct block_description *) arg;
 	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
@@ -457,6 +457,7 @@ static struct starpu_perfmodel cl_update_model =
 struct starpu_codelet cl_update =
 struct starpu_codelet cl_update =
 {
 {
 	.cpu_funcs = {update_func_cpu, NULL},
 	.cpu_funcs = {update_func_cpu, NULL},
+	.cpu_funcs_name = {"update_func_cpu", NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {update_func_cuda, NULL},
 	.cuda_funcs = {update_func_cuda, NULL},
 #endif
 #endif
@@ -541,7 +542,7 @@ unsigned top_per_worker[STARPU_NMAXWORKERS];
 unsigned bottom_per_worker[STARPU_NMAXWORKERS];
 unsigned bottom_per_worker[STARPU_NMAXWORKERS];
 
 
 /* top save, CPU version */
 /* top save, CPU version */
-static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
+void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
 {
 {
 	struct block_description *block = (struct block_description *) arg;
 	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
@@ -557,7 +558,7 @@ static void dummy_func_top_cpu(void *descr[] __attribute__((unused)), void *arg)
 }
 }
 
 
 /* bottom save, CPU version */
 /* bottom save, CPU version */
-static void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *arg)
+void dummy_func_bottom_cpu(void *descr[] __attribute__((unused)), void *arg)
 {
 {
 	struct block_description *block = (struct block_description *) arg;
 	struct block_description *block = (struct block_description *) arg;
 	int workerid = starpu_worker_get_id();
 	int workerid = starpu_worker_get_id();
@@ -657,6 +658,7 @@ static struct starpu_perfmodel save_cl_top_model =
 struct starpu_codelet save_cl_bottom =
 struct starpu_codelet save_cl_bottom =
 {
 {
 	.cpu_funcs = {dummy_func_bottom_cpu, NULL},
 	.cpu_funcs = {dummy_func_bottom_cpu, NULL},
+	.cpu_funcs_name = {"dummy_func_bottom_cpu", NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dummy_func_bottom_cuda, NULL},
 	.cuda_funcs = {dummy_func_bottom_cuda, NULL},
 #endif
 #endif
@@ -671,6 +673,7 @@ struct starpu_codelet save_cl_bottom =
 struct starpu_codelet save_cl_top =
 struct starpu_codelet save_cl_top =
 {
 {
 	.cpu_funcs = {dummy_func_top_cpu, NULL},
 	.cpu_funcs = {dummy_func_top_cpu, NULL},
+	.cpu_funcs_name = {"dummy_func_top_cpu", NULL},
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	.cuda_funcs = {dummy_func_top_cuda, NULL},
 	.cuda_funcs = {dummy_func_top_cuda, NULL},
 #endif
 #endif

+ 3 - 2
examples/stencil/stencil-tasks.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010  Université de Bordeaux 1
+ * Copyright (C) 2010, 2013  Université de Bordeaux 1
  * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -213,11 +213,12 @@ void create_task_update(unsigned iter, unsigned z, int local_rank)
 }
 }
 
 
 /* Dummy empty codelet taking one buffer */
 /* Dummy empty codelet taking one buffer */
-static void null_func(void *descr[] __attribute__((unused)), void *arg __attribute__((unused))) { }
+void null_func(void *descr[] __attribute__((unused)), void *arg __attribute__((unused))) { }
 static struct starpu_codelet null =
 static struct starpu_codelet null =
 {
 {
 	.modes = { STARPU_W, STARPU_W },
 	.modes = { STARPU_W, STARPU_W },
 	.cpu_funcs = {null_func, NULL},
 	.cpu_funcs = {null_func, NULL},
+	.cpu_funcs_name = {"null_func", NULL},
 	.cuda_funcs = {null_func, NULL},
 	.cuda_funcs = {null_func, NULL},
 	.opencl_funcs = {null_func, NULL},
 	.opencl_funcs = {null_func, NULL},
 	.nbuffers = 2
 	.nbuffers = 2

+ 25 - 0
include/starpu.h

@@ -92,6 +92,10 @@ struct starpu_conf
 	int ncuda;
 	int ncuda;
 	/* number of GPU OpenCL device workers (-1 for default) */
 	/* number of GPU OpenCL device workers (-1 for default) */
 	int nopencl;
 	int nopencl;
+	/* number of MIC device workers (-1 for default) */
+	int nmic;
+	/* number of SCC device workers (-1 for default) */
+	int nscc;
 
 
 	unsigned use_explicit_workers_bindid;
 	unsigned use_explicit_workers_bindid;
 	unsigned workers_bindid[STARPU_NMAXWORKERS];
 	unsigned workers_bindid[STARPU_NMAXWORKERS];
@@ -102,6 +106,12 @@ struct starpu_conf
 	unsigned use_explicit_workers_opencl_gpuid;
 	unsigned use_explicit_workers_opencl_gpuid;
 	unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS];
 	unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS];
 
 
+	unsigned use_explicit_workers_mic_deviceid;
+	unsigned workers_mic_deviceid[STARPU_NMAXWORKERS];
+
+	unsigned use_explicit_workers_scc_deviceid;
+	unsigned workers_scc_deviceid[STARPU_NMAXWORKERS];
+
 	/* calibrate bus (-1 for default) */
 	/* calibrate bus (-1 for default) */
 	int bus_calibrate;
 	int bus_calibrate;
 
 
@@ -111,6 +121,10 @@ struct starpu_conf
 	/* Create only one combined worker, containing all CPU workers */
 	/* Create only one combined worker, containing all CPU workers */
 	int single_combined_worker;
 	int single_combined_worker;
 
 
+	/* Path to the kernel to execute on the MIC device, compiled
+	 * for MIC architecture. */
+	char *mic_sink_program_path;
+
 	/* indicate if all asynchronous copies should be disabled */
 	/* indicate if all asynchronous copies should be disabled */
 	int disable_asynchronous_copy;
 	int disable_asynchronous_copy;
 
 
@@ -120,6 +134,9 @@ struct starpu_conf
 	/* indicate if asynchronous copies to OpenCL devices should be disabled */
 	/* indicate if asynchronous copies to OpenCL devices should be disabled */
 	int disable_asynchronous_opencl_copy;
 	int disable_asynchronous_opencl_copy;
 
 
+	/* indicate if asynchronous copies to MIC devices should be disabled */
+	int disable_asynchronous_mic_copy;
+
 	/* Enable CUDA/OpenGL interoperation on these CUDA devices */
 	/* Enable CUDA/OpenGL interoperation on these CUDA devices */
 	unsigned *cuda_opengl_interoperability;
 	unsigned *cuda_opengl_interoperability;
 	unsigned n_cuda_opengl_interoperability;
 	unsigned n_cuda_opengl_interoperability;
@@ -140,6 +157,12 @@ int starpu_conf_init(struct starpu_conf *conf);
  */
  */
 int starpu_init(struct starpu_conf *conf) STARPU_WARN_UNUSED_RESULT;
 int starpu_init(struct starpu_conf *conf) STARPU_WARN_UNUSED_RESULT;
 
 
+/* Alternative initialization method taking argc and argv. This is used by
+ * the MIC, MPI, and SCC implementations.
+ * Do not call both starpu_init and starpu_initialize in the same program.
+ */
+int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv);
+
 /* Shutdown method: note that statistics are only generated once StarPU is
 /* Shutdown method: note that statistics are only generated once StarPU is
  * shutdown */
  * shutdown */
 void starpu_shutdown(void);
 void starpu_shutdown(void);
@@ -156,6 +179,8 @@ void starpu_display_stats();
 
 
 void starpu_get_version(int *major, int *minor, int *release);
 void starpu_get_version(int *major, int *minor, int *release);
 
 
+int starpu_worker_get_mp_nodeid(int id);
+
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }
 #endif
 #endif

+ 5 - 0
include/starpu_config.h.in

@@ -25,6 +25,8 @@
 #undef STARPU_USE_CPU
 #undef STARPU_USE_CPU
 #undef STARPU_USE_CUDA
 #undef STARPU_USE_CUDA
 #undef STARPU_USE_OPENCL
 #undef STARPU_USE_OPENCL
+#undef STARPU_USE_MIC
+#undef STARPU_USE_SCC
 
 
 #undef STARPU_SIMGRID
 #undef STARPU_SIMGRID
 
 
@@ -70,9 +72,12 @@
 #undef STARPU_MAXCPUS
 #undef STARPU_MAXCPUS
 #undef STARPU_MAXCUDADEVS
 #undef STARPU_MAXCUDADEVS
 #undef STARPU_MAXOPENCLDEVS
 #undef STARPU_MAXOPENCLDEVS
+#undef STARPU_MAXMICDEVS
+#undef STARPU_MAXSCCDEVS
 #undef STARPU_NMAXWORKERS
 #undef STARPU_NMAXWORKERS
 #undef STARPU_NMAX_SCHED_CTXS
 #undef STARPU_NMAX_SCHED_CTXS
 #undef STARPU_MAXIMPLEMENTATIONS
 #undef STARPU_MAXIMPLEMENTATIONS
+#undef STARPU_MAXMPKERNELS
 #undef STARPU_USE_SC_HYPERVISOR
 #undef STARPU_USE_SC_HYPERVISOR
 #undef STARPU_HAVE_GLPK_H
 #undef STARPU_HAVE_GLPK_H
 
 

+ 11 - 2
include/starpu_data.h

@@ -35,7 +35,9 @@ enum starpu_data_access_mode
 	STARPU_W=(1<<1),
 	STARPU_W=(1<<1),
 	STARPU_RW=(STARPU_R|STARPU_W),
 	STARPU_RW=(STARPU_R|STARPU_W),
 	STARPU_SCRATCH=(1<<2),
 	STARPU_SCRATCH=(1<<2),
-	STARPU_REDUX=(1<<3)
+	STARPU_REDUX=(1<<3),
+	STARPU_COMMUTE=(1<<4)
+	/* Note: other STARPU_* values in include/starpu_task_util.h */
 };
 };
 
 
 struct starpu_data_descr
 struct starpu_data_descr
@@ -102,7 +104,14 @@ enum starpu_node_kind
 	STARPU_UNUSED     = 0x00,
 	STARPU_UNUSED     = 0x00,
 	STARPU_CPU_RAM    = 0x01,
 	STARPU_CPU_RAM    = 0x01,
 	STARPU_CUDA_RAM   = 0x02,
 	STARPU_CUDA_RAM   = 0x02,
-	STARPU_OPENCL_RAM = 0x03
+	STARPU_OPENCL_RAM = 0x03,
+	STARPU_MIC_RAM    = 0x05,
+
+	/* This node kind is not used anymore, but implementations in interfaces
+	 * will be useful for MPI. */
+	STARPU_SCC_RAM    = 0x06,
+
+	STARPU_SCC_SHM    = 0x07
 };
 };
 
 
 unsigned starpu_worker_get_memory_node(unsigned workerid);
 unsigned starpu_worker_get_memory_node(unsigned workerid);

+ 39 - 2
include/starpu_data_interfaces.h

@@ -45,6 +45,7 @@ struct starpu_data_copy_methods
 	int (*ram_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*ram_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*ram_to_cuda)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*ram_to_cuda)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*ram_to_opencl)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*ram_to_opencl)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+	int (*ram_to_mic)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 
 
 	/* src type is cuda */
 	/* src type is cuda */
 	int (*cuda_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*cuda_to_ram)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
@@ -56,6 +57,14 @@ struct starpu_data_copy_methods
 	int (*opencl_to_cuda)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*opencl_to_cuda)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*opencl_to_opencl)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 	int (*opencl_to_opencl)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
 
 
+	/* src type is mic */
+	int (*mic_to_ram)(void *src_interface, unsigned srd_node, void *dst_interface, unsigned dst_node);
+
+	/* scc case */
+	int (*scc_src_to_sink)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+	int (*scc_sink_to_src)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+	int (*scc_sink_to_sink)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
 	/* for asynchronous CUDA transfers */
 	/* for asynchronous CUDA transfers */
 	int (*ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
 	int (*ram_to_cuda_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cudaStream_t stream);
@@ -74,6 +83,12 @@ struct starpu_data_copy_methods
 	int (*opencl_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event);
 	int (*opencl_to_opencl_async)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, cl_event *event);
 #endif
 #endif
 
 
+#ifdef STARPU_USE_MIC
+	/* Asynchronous MIC transfers */
+	int (*ram_to_mic_async)(void *src_intreface, unsigned src_node, void *dst_interface, unsigned dst_node);
+	int (*mic_to_ram_async)(void *src_interface, unsigned srd_node, void *dst_interface, unsigned dst_node);
+#endif
+
 	int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 	int (*any_to_any)(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 };
 };
 
 
@@ -162,6 +177,8 @@ extern struct starpu_data_interface_ops starpu_interface_matrix_ops;
 /* Matrix interface for dense matrices */
 /* Matrix interface for dense matrices */
 struct starpu_matrix_interface
 struct starpu_matrix_interface
 {
 {
+	enum starpu_data_interface_id id;
+
 	uintptr_t ptr;
 	uintptr_t ptr;
 	uintptr_t dev_handle;
 	uintptr_t dev_handle;
 	size_t offset;
 	size_t offset;
@@ -192,6 +209,8 @@ size_t starpu_matrix_get_elemsize(starpu_data_handle_t handle);
  */
  */
 struct starpu_coo_interface
 struct starpu_coo_interface
 {
 {
+	enum starpu_data_interface_id id;
+
 	uint32_t  *columns;
 	uint32_t  *columns;
 	uint32_t  *rows;
 	uint32_t  *rows;
 	uintptr_t values;
 	uintptr_t values;
@@ -229,6 +248,8 @@ void starpu_coo_data_register(starpu_data_handle_t *handleptr, unsigned home_nod
 /* TODO: rename to 3dmatrix? */
 /* TODO: rename to 3dmatrix? */
 struct starpu_block_interface
 struct starpu_block_interface
 {
 {
+	enum starpu_data_interface_id id;
+
 	uintptr_t ptr;
 	uintptr_t ptr;
 	uintptr_t dev_handle;
 	uintptr_t dev_handle;
 	size_t offset;
 	size_t offset;
@@ -263,6 +284,8 @@ size_t starpu_block_get_elemsize(starpu_data_handle_t handle);
 /* vector interface for contiguous (non-strided) buffers */
 /* vector interface for contiguous (non-strided) buffers */
 struct starpu_vector_interface
 struct starpu_vector_interface
 {
 {
+	enum starpu_data_interface_id id;
+
 	uintptr_t ptr;
 	uintptr_t ptr;
 	uintptr_t dev_handle;
 	uintptr_t dev_handle;
 	size_t offset;
 	size_t offset;
@@ -285,9 +308,12 @@ uintptr_t starpu_vector_get_local_ptr(starpu_data_handle_t handle);
 /* variable interface for a single data (not a vector, a matrix, a list, ...) */
 /* variable interface for a single data (not a vector, a matrix, a list, ...) */
 struct starpu_variable_interface
 struct starpu_variable_interface
 {
 {
+	enum starpu_data_interface_id id;
+
 	uintptr_t ptr;
 	uintptr_t ptr;
+	uintptr_t dev_handle;
+	size_t offset;
 	size_t elemsize;
 	size_t elemsize;
-	/* No dev_handle, since it can not be filtered, offset will always be zero */
 };
 };
 
 
 void starpu_variable_data_register(starpu_data_handle_t *handle, unsigned home_node, uintptr_t ptr, size_t size);
 void starpu_variable_data_register(starpu_data_handle_t *handle, unsigned home_node, uintptr_t ptr, size_t size);
@@ -296,10 +322,10 @@ uintptr_t starpu_variable_get_local_ptr(starpu_data_handle_t handle);
 
 
 /* helper methods */
 /* helper methods */
 #define STARPU_VARIABLE_GET_PTR(interface)	(((struct starpu_variable_interface *)(interface))->ptr)
 #define STARPU_VARIABLE_GET_PTR(interface)	(((struct starpu_variable_interface *)(interface))->ptr)
+#define STARPU_VARIABLE_GET_OFFSET(interface)	(((struct starpu_variable_interface *)(interface))->offset)
 #define STARPU_VARIABLE_GET_ELEMSIZE(interface)	(((struct starpu_variable_interface *)(interface))->elemsize)
 #define STARPU_VARIABLE_GET_ELEMSIZE(interface)	(((struct starpu_variable_interface *)(interface))->elemsize)
 #define STARPU_VARIABLE_GET_DEV_HANDLE(interface) \
 #define STARPU_VARIABLE_GET_DEV_HANDLE(interface) \
 	(((struct starpu_variable_interface *)(interface))->ptr)
 	(((struct starpu_variable_interface *)(interface))->ptr)
-#define STARPU_VARIABLE_GET_OFFSET 0
 
 
 /* void interface. There is no data really associated to that interface, but it
 /* void interface. There is no data really associated to that interface, but it
  * may be used as a synchronization mechanism. It also permits to express an
  * may be used as a synchronization mechanism. It also permits to express an
@@ -311,6 +337,8 @@ void starpu_void_data_register(starpu_data_handle_t *handle);
 /* CSR interface for sparse matrices (compressed sparse row representation) */
 /* CSR interface for sparse matrices (compressed sparse row representation) */
 struct starpu_csr_interface
 struct starpu_csr_interface
 {
 {
+	enum starpu_data_interface_id id;
+
 	uint32_t nnz; /* number of non-zero entries */
 	uint32_t nnz; /* number of non-zero entries */
 	uint32_t nrow; /* number of rows */
 	uint32_t nrow; /* number of rows */
 	uintptr_t nzval; /* non-zero values */
 	uintptr_t nzval; /* non-zero values */
@@ -352,6 +380,8 @@ size_t starpu_csr_get_elemsize(starpu_data_handle_t handle);
  * representation) */
  * representation) */
 struct starpu_bcsr_interface
 struct starpu_bcsr_interface
 {
 {
+	enum starpu_data_interface_id id;
+
 	uint32_t nnz; /* number of non-zero BLOCKS */
 	uint32_t nnz; /* number of non-zero BLOCKS */
 	uint32_t nrow; /* number of rows (in terms of BLOCKS) */
 	uint32_t nrow; /* number of rows (in terms of BLOCKS) */
 
 
@@ -406,13 +436,19 @@ struct starpu_multiformat_data_interface_ops
 	size_t cuda_elemsize;
 	size_t cuda_elemsize;
 	struct starpu_codelet *cpu_to_cuda_cl;
 	struct starpu_codelet *cpu_to_cuda_cl;
 	struct starpu_codelet *cuda_to_cpu_cl;
 	struct starpu_codelet *cuda_to_cpu_cl;
+	size_t mic_elemsize;
+	struct starpu_codelet *cpu_to_mic_cl;
+	struct starpu_codelet *mic_to_cpu_cl;
 };
 };
 
 
 struct starpu_multiformat_interface
 struct starpu_multiformat_interface
 {
 {
+	enum starpu_data_interface_id id;
+
 	void *cpu_ptr;
 	void *cpu_ptr;
 	void *cuda_ptr;
 	void *cuda_ptr;
 	void *opencl_ptr;
 	void *opencl_ptr;
+	void *mic_ptr;
 	uint32_t nx;
 	uint32_t nx;
 	struct starpu_multiformat_data_interface_ops *ops;
 	struct starpu_multiformat_data_interface_ops *ops;
 };
 };
@@ -422,6 +458,7 @@ void starpu_multiformat_data_register(starpu_data_handle_t *handle, unsigned hom
 #define STARPU_MULTIFORMAT_GET_CPU_PTR(interface)  (((struct starpu_multiformat_interface *)(interface))->cpu_ptr)
 #define STARPU_MULTIFORMAT_GET_CPU_PTR(interface)  (((struct starpu_multiformat_interface *)(interface))->cpu_ptr)
 #define STARPU_MULTIFORMAT_GET_CUDA_PTR(interface) (((struct starpu_multiformat_interface *)(interface))->cuda_ptr)
 #define STARPU_MULTIFORMAT_GET_CUDA_PTR(interface) (((struct starpu_multiformat_interface *)(interface))->cuda_ptr)
 #define STARPU_MULTIFORMAT_GET_OPENCL_PTR(interface) (((struct starpu_multiformat_interface *)(interface))->opencl_ptr)
 #define STARPU_MULTIFORMAT_GET_OPENCL_PTR(interface) (((struct starpu_multiformat_interface *)(interface))->opencl_ptr)
+#define STARPU_MULTIFORMAT_GET_MIC_PTR(interface) (((struct starpu_multiformat_interface *)(interface))->mic_ptr)
 #define STARPU_MULTIFORMAT_GET_NX(interface)  (((struct starpu_multiformat_interface *)(interface))->nx)
 #define STARPU_MULTIFORMAT_GET_NX(interface)  (((struct starpu_multiformat_interface *)(interface))->nx)
 
 
 enum starpu_data_interface_id starpu_data_get_interface_id(starpu_data_handle_t handle);
 enum starpu_data_interface_id starpu_data_get_interface_id(starpu_data_handle_t handle);

+ 35 - 0
include/starpu_mic.h

@@ -0,0 +1,35 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#ifndef __STARPU_MIC_H__
+#define __STARPU_MIC_H__
+
+#include <starpu_config.h>
+
+
+#ifdef STARPU_USE_MIC
+
+typedef void *starpu_mic_func_symbol_t;
+
+int starpu_mic_register_kernel(starpu_mic_func_symbol_t *symbol, const char *func_name);
+
+starpu_mic_kernel_t starpu_mic_get_kernel(starpu_mic_func_symbol_t symbol);
+
+#endif /* STARPU_USE_MIC */
+
+
+#endif /* __STARPU_MIC_H__ */

+ 10 - 4
include/starpu_perfmodel.h

@@ -43,8 +43,10 @@ enum starpu_perfmodel_archtype
 	STARPU_CPU_DEFAULT = 0,
 	STARPU_CPU_DEFAULT = 0,
 	/* CPU combined workers between 0 and STARPU_MAXCPUS-1 */
 	/* CPU combined workers between 0 and STARPU_MAXCPUS-1 */
 	STARPU_CUDA_DEFAULT = STARPU_MAXCPUS,
 	STARPU_CUDA_DEFAULT = STARPU_MAXCPUS,
-	STARPU_OPENCL_DEFAULT = STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS
+	STARPU_OPENCL_DEFAULT = STARPU_CUDA_DEFAULT + STARPU_MAXCUDADEVS,
 	/* STARPU_OPENCL_DEFAULT + devid */
 	/* STARPU_OPENCL_DEFAULT + devid */
+	STARPU_MIC_DEFAULT = STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS,
+	STARPU_SCC_DEFAULT = STARPU_MIC_DEFAULT + STARPU_MAXMICDEVS
 };
 };
 
 
 #ifdef __STDC_VERSION__
 #ifdef __STDC_VERSION__
@@ -54,15 +56,19 @@ enum starpu_perfmodel_archtype
 
 
 _Static_assert(STARPU_CPU_DEFAULT == 0,
 _Static_assert(STARPU_CPU_DEFAULT == 0,
 	       "invalid STARPU_CPU_DEFAULT value");
 	       "invalid STARPU_CPU_DEFAULT value");
-_Static_assert(STARPU_CUDA_DEFAULT > STARPU_CPU_DEFAULT,
-	       "invalid STARPU_CPU_DEFAULT value");
+_Static_assert(STARPU_CPU_DEFAULT < STARPU_CUDA_DEFAULT,
+	       "invalid STARPU_{CPU,CUDA}_DEFAULT values");
 _Static_assert(STARPU_CUDA_DEFAULT < STARPU_OPENCL_DEFAULT,
 _Static_assert(STARPU_CUDA_DEFAULT < STARPU_OPENCL_DEFAULT,
 	       "invalid STARPU_{CUDA,OPENCL}_DEFAULT values");
 	       "invalid STARPU_{CUDA,OPENCL}_DEFAULT values");
+_Static_assert(STARPU_OPENCL_DEFAULT < STARPU_MIC_DEFAULT,
+	       "invalid STARPU_{OPENCL,MIC}_DEFAULT values");
+_Static_assert(STARPU_MIC_DEFAULT < STARPU_SCC_DEFAULT,
+	       "invalid STARPU_{MIC,SCC}_DEFAULT values");
 
 
 #  endif
 #  endif
 #endif
 #endif
 
 
-#define STARPU_NARCH_VARIATIONS	(STARPU_OPENCL_DEFAULT + STARPU_MAXOPENCLDEVS)
+#define STARPU_NARCH_VARIATIONS	(STARPU_MIC_DEFAULT + STARPU_MAXMICDEVS)
 
 
 struct starpu_perfmodel_history_entry
 struct starpu_perfmodel_history_entry
 {
 {

+ 35 - 0
include/starpu_scc.h

@@ -0,0 +1,35 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#ifndef __STARPU_SCC_H__
+#define __STARPU_SCC_H__
+
+#include <starpu_config.h>
+
+
+#ifdef STARPU_USE_SCC
+
+typedef void *starpu_scc_func_symbol_t;
+
+int starpu_scc_register_kernel(starpu_scc_func_symbol_t *symbol, const char *func_name);
+
+starpu_scc_kernel_t starpu_scc_get_kernel(starpu_scc_func_symbol_t symbol);
+
+#endif /* STARPU_USE_SCC */
+
+
+#endif /* __STARPU_SCC_H__ */

+ 1 - 1
include/starpu_sched_ctx.h

@@ -114,7 +114,7 @@ void* starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id);
  * WORKERS IN CONTEXT 
  * WORKERS IN CONTEXT 
 */
 */
 /* create a worker collection for a context, the type can be only STARPU_WORKER_LIST for now, which corresponds to a simple list */
 /* create a worker collection for a context, the type can be only STARPU_WORKER_LIST for now, which corresponds to a simple list */
-struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, int type);
+struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, enum starpu_worker_collection_type type);
 
 
 /* free the worker collection when removing the context */
 /* free the worker collection when removing the context */
 void starpu_sched_ctx_delete_worker_collection(unsigned sched_ctx_id);
 void starpu_sched_ctx_delete_worker_collection(unsigned sched_ctx_id);

+ 23 - 0
include/starpu_sink.h

@@ -0,0 +1,23 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#ifndef __STARPU_SINK_H__
+#define __STARPU_SINK_H__
+
+void starpu_sink_common_worker(int argc, char **argv);
+
+#endif /* __STARPU_SINK_H__ */

+ 11 - 0
include/starpu_task.h

@@ -37,6 +37,8 @@ extern "C"
 #define STARPU_CPU	((1ULL)<<1)
 #define STARPU_CPU	((1ULL)<<1)
 #define STARPU_CUDA	((1ULL)<<3)
 #define STARPU_CUDA	((1ULL)<<3)
 #define STARPU_OPENCL	((1ULL)<<6)
 #define STARPU_OPENCL	((1ULL)<<6)
+#define STARPU_MIC	((1ULL)<<7)
+#define STARPU_SCC	((1ULL)<<8)
 
 
 /* Codelet types */
 /* Codelet types */
 enum starpu_codelet_type
 enum starpu_codelet_type
@@ -65,6 +67,11 @@ typedef uint64_t starpu_tag_t;
 typedef void (*starpu_cpu_func_t)(void **, void*);    /* CPU core */
 typedef void (*starpu_cpu_func_t)(void **, void*);    /* CPU core */
 typedef void (*starpu_cuda_func_t)(void **, void*);   /* NVIDIA CUDA device */
 typedef void (*starpu_cuda_func_t)(void **, void*);   /* NVIDIA CUDA device */
 typedef void (*starpu_opencl_func_t)(void **, void*); /* OpenCL CUDA device */
 typedef void (*starpu_opencl_func_t)(void **, void*); /* OpenCL CUDA device */
+typedef void (*starpu_mic_kernel_t)(void **, void*); /* MIC device */
+typedef void (*starpu_scc_kernel_t)(void **, void*); /* SCC device */
+
+typedef starpu_mic_kernel_t (*starpu_mic_func_t)(void);
+typedef starpu_scc_kernel_t (*starpu_scc_func_t)(void);
 
 
 #define STARPU_MULTIPLE_CPU_IMPLEMENTATIONS    ((starpu_cpu_func_t) -1)
 #define STARPU_MULTIPLE_CPU_IMPLEMENTATIONS    ((starpu_cpu_func_t) -1)
 #define STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS   ((starpu_cuda_func_t) -1)
 #define STARPU_MULTIPLE_CUDA_IMPLEMENTATIONS   ((starpu_cuda_func_t) -1)
@@ -91,6 +98,10 @@ struct starpu_codelet
 	starpu_cpu_func_t cpu_funcs[STARPU_MAXIMPLEMENTATIONS];
 	starpu_cpu_func_t cpu_funcs[STARPU_MAXIMPLEMENTATIONS];
 	starpu_cuda_func_t cuda_funcs[STARPU_MAXIMPLEMENTATIONS];
 	starpu_cuda_func_t cuda_funcs[STARPU_MAXIMPLEMENTATIONS];
 	starpu_opencl_func_t opencl_funcs[STARPU_MAXIMPLEMENTATIONS];
 	starpu_opencl_func_t opencl_funcs[STARPU_MAXIMPLEMENTATIONS];
+	starpu_mic_func_t mic_funcs[STARPU_MAXIMPLEMENTATIONS];
+	starpu_scc_func_t scc_funcs[STARPU_MAXIMPLEMENTATIONS];
+
+	char *cpu_funcs_name[STARPU_MAXIMPLEMENTATIONS];
 
 
 	/* how many buffers do the codelet takes as argument ? */
 	/* how many buffers do the codelet takes as argument ? */
 	unsigned nbuffers;
 	unsigned nbuffers;

+ 12 - 12
include/starpu_task_util.h

@@ -35,18 +35,18 @@ void starpu_create_sync_task(starpu_tag_t sync_tag, unsigned ndeps, starpu_tag_t
 				void (*callback)(void *), void *callback_arg);
 				void (*callback)(void *), void *callback_arg);
 
 
 /* Constants used by the starpu_insert_task helper to determine the different types of argument */
 /* Constants used by the starpu_insert_task helper to determine the different types of argument */
-#define STARPU_VALUE		(1<<4)	/* Pointer to a constant value */
-#define STARPU_CALLBACK		(1<<5)	/* Callback function */
-#define STARPU_CALLBACK_WITH_ARG	(1<<6)	/* Callback function */
-#define STARPU_CALLBACK_ARG	(1<<7)	/* Argument of the callback function (of type void *) */
-#define STARPU_PRIORITY		(1<<8)	/* Priority associated to the task */
-#define STARPU_EXECUTE_ON_NODE	(1<<9)	/* Used by MPI to define which task is going to execute the codelet */
-#define STARPU_EXECUTE_ON_DATA	(1<<10)	/* Used by MPI to define which task is going to execute the codelet */
-#define STARPU_DATA_ARRAY       (1<<11) /* Array of data handles */
-#define STARPU_TAG              (1<<12) /* Tag */
-#define STARPU_HYPERVISOR_TAG	(1<<13)	/* Used to tag a task after whose execution we'll execute  a code */
-#define STARPU_FLOPS	        (1<<14)	/* Used to specify the number of flops needed to be executed by a task */
-#define STARPU_SCHED_CTX	(1<<15)	/* Used to specify the sched_ctx to which the task will be submitted */
+#define STARPU_VALUE		(1<<19)	/* Pointer to a constant value */
+#define STARPU_CALLBACK		(1<<20)	/* Callback function */
+#define STARPU_CALLBACK_WITH_ARG	(1<<21)	/* Callback function */
+#define STARPU_CALLBACK_ARG	(1<<22)	/* Argument of the callback function (of type void *) */
+#define STARPU_PRIORITY		(1<<23)	/* Priority associated to the task */
+#define STARPU_EXECUTE_ON_NODE	(1<<24)	/* Used by MPI to define which task is going to execute the codelet */
+#define STARPU_EXECUTE_ON_DATA	(1<<25)	/* Used by MPI to define which task is going to execute the codelet */
+#define STARPU_DATA_ARRAY       (1<<26) /* Array of data handles */
+#define STARPU_TAG              (1<<27) /* Tag */
+#define STARPU_HYPERVISOR_TAG	(1<<28)	/* Used to tag a task after whose execution we'll execute  a code */
+#define STARPU_FLOPS	        (1<<29)	/* Used to specify the number of flops needed to be executed by a task */
+#define STARPU_SCHED_CTX	(1<<30)	/* Used to specify the sched_ctx to which the task will be submitted */
 
 
 /* Wrapper to create a task. */
 /* Wrapper to create a task. */
 int starpu_insert_task(struct starpu_codelet *cl, ...);
 int starpu_insert_task(struct starpu_codelet *cl, ...);

+ 1 - 1
include/starpu_util.h

@@ -177,7 +177,7 @@ STARPU_ATOMIC_SOMETHING(or, old | value)
 #define STARPU_SYNCHRONIZE() __asm__ __volatile__("sync" ::: "memory")
 #define STARPU_SYNCHRONIZE() __asm__ __volatile__("sync" ::: "memory")
 #endif
 #endif
 
 
-#if defined(__i386__)
+#if defined(__i386__) || defined(__KNC__) || defined(__KNF__)
 #define STARPU_RMB() __asm__ __volatile__("lock; addl $0,0(%%esp)" ::: "memory")
 #define STARPU_RMB() __asm__ __volatile__("lock; addl $0,0(%%esp)" ::: "memory")
 #define STARPU_WMB() __asm__ __volatile__("lock; addl $0,0(%%esp)" ::: "memory")
 #define STARPU_WMB() __asm__ __volatile__("lock; addl $0,0(%%esp)" ::: "memory")
 #elif defined(__x86_64__)
 #elif defined(__x86_64__)

+ 41 - 6
include/starpu_worker.h

@@ -36,7 +36,17 @@ enum starpu_worker_archtype
 	STARPU_ANY_WORKER,    /* any worker, used in the hypervisor */
 	STARPU_ANY_WORKER,    /* any worker, used in the hypervisor */
 	STARPU_CPU_WORKER,    /* CPU core */
 	STARPU_CPU_WORKER,    /* CPU core */
 	STARPU_CUDA_WORKER,   /* NVIDIA CUDA device */
 	STARPU_CUDA_WORKER,   /* NVIDIA CUDA device */
-	STARPU_OPENCL_WORKER  /* OpenCL device */
+	STARPU_OPENCL_WORKER, /* OpenCL device */
+	STARPU_MIC_WORKER,    /* Intel MIC device */
+	STARPU_SCC_WORKER     /* Intel SCC device */
+};
+
+/* Represents the topology of sink devices; contains useful information about
+ * their capabilities. */
+// XXX: unused.
+struct starpu_sink_topology
+{
+	unsigned nb_cpus;
 };
 };
 
 
 struct starpu_sched_ctx_iterator
 struct starpu_sched_ctx_iterator
@@ -61,10 +71,20 @@ struct starpu_machine_topology
 	unsigned nhwcpus;
 	unsigned nhwcpus;
 	unsigned nhwcudagpus;
 	unsigned nhwcudagpus;
 	unsigned nhwopenclgpus;
 	unsigned nhwopenclgpus;
+	unsigned nhwscc;
 
 
 	unsigned ncpus;
 	unsigned ncpus;
 	unsigned ncudagpus;
 	unsigned ncudagpus;
 	unsigned nopenclgpus;
 	unsigned nopenclgpus;
+	unsigned nsccdevices;
+
+	/* Topology of MP nodes (mainly MIC and SCC) as well as necessary
+	 * objects to communicate with them. */
+	unsigned nhwmicdevices;
+	unsigned nmicdevices;
+
+	unsigned nhwmiccores[STARPU_MAXMICDEVS]; // Each MIC node has its set of cores.
+	unsigned nmiccores[STARPU_MAXMICDEVS];
 
 
 	/* Where to bind workers ? */
 	/* Where to bind workers ? */
 	unsigned workers_bindid[STARPU_NMAXWORKERS];
 	unsigned workers_bindid[STARPU_NMAXWORKERS];
@@ -74,6 +94,18 @@ struct starpu_machine_topology
 
 
 	/* Which GPU(s) do we use for OpenCL ? */
 	/* Which GPU(s) do we use for OpenCL ? */
 	unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS];
 	unsigned workers_opencl_gpuid[STARPU_NMAXWORKERS];
+
+	/* Which MIC core(s) do we use ? */
+	/* unsigned workers_mic_deviceid[STARPU_NMAXWORKERS]; */
+
+	/* Which SCC(s) do we use ? */
+	unsigned workers_scc_deviceid[STARPU_NMAXWORKERS];
+};
+
+/* types of structures the worker collection can implement */
+enum starpu_worker_collection_type
+{
+	STARPU_WORKER_LIST
 };
 };
 
 
 /* generic structure used by the scheduling contexts to iterate the workers */
 /* generic structure used by the scheduling contexts to iterate the workers */
@@ -83,8 +115,8 @@ struct starpu_worker_collection
 	void *workerids;
 	void *workerids;
 	/* the number of workers in the collection */
 	/* the number of workers in the collection */
 	unsigned nworkers;
 	unsigned nworkers;
-	/* the type of structure (STARPU_WORKER_LIST,...) */
-	int type;
+	/* the type of structure */
+	enum starpu_worker_collection_type type;
 	/* checks if there is another element in collection */
 	/* checks if there is another element in collection */
 	unsigned (*has_next)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 	unsigned (*has_next)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 	/* return the next element in the collection */
 	/* return the next element in the collection */
@@ -101,9 +133,6 @@ struct starpu_worker_collection
 	void (*init_iterator)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 	void (*init_iterator)(struct starpu_worker_collection *workers, struct starpu_sched_ctx_iterator *it);
 };
 };
 
 
-/* types of structures the worker collection can implement */
-#define STARPU_WORKER_LIST 0
-
 /* This function returns the number of workers (ie. processing units executing
 /* This function returns the number of workers (ie. processing units executing
  * StarPU tasks). The returned value should be at most STARPU_NMAXWORKERS. */
  * StarPU tasks). The returned value should be at most STARPU_NMAXWORKERS. */
 unsigned starpu_worker_get_count(void);
 unsigned starpu_worker_get_count(void);
@@ -113,6 +142,10 @@ unsigned starpu_worker_is_combined_worker(int id);
 unsigned starpu_cpu_worker_get_count(void);
 unsigned starpu_cpu_worker_get_count(void);
 unsigned starpu_cuda_worker_get_count(void);
 unsigned starpu_cuda_worker_get_count(void);
 unsigned starpu_opencl_worker_get_count(void);
 unsigned starpu_opencl_worker_get_count(void);
+unsigned starpu_mic_worker_get_count(void);
+unsigned starpu_scc_worker_get_count(void);
+
+unsigned starpu_mic_device_get_count(void);
 
 
 /* Return the identifier of the thread in case this is associated to a worker.
 /* Return the identifier of the thread in case this is associated to a worker.
  * This will return -1 if this function is called directly from the application
  * This will return -1 if this function is called directly from the application
@@ -166,6 +199,8 @@ void starpu_worker_get_name(int id, char *dst, size_t maxlen);
  */
  */
 int starpu_worker_get_devid(int id);
 int starpu_worker_get_devid(int id);
 
 
+int starpu_worker_get_mp_nodeid(int id);
+
 #ifdef __cplusplus
 #ifdef __cplusplus
 }
 }
 #endif
 #endif

+ 28 - 0
libstarpu-mic.pc.in

@@ -0,0 +1,28 @@
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2009-2011, 2013  Université de Bordeaux 1
+# Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+
+prefix=@prefix@
+exec_prefix=@exec_prefix@
+libdir=@libdir@
+includedir=@includedir@
+
+Name: starpu
+Description: offers support for heterogeneous multicore architecture
+Version: @PACKAGE_VERSION@
+Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ -DSTARPU_USE_DEPRECATED_API
+Libs: @STARPU_EXPORT_DYNAMIC@ -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@ @STARPU_SC_HYPERVISOR@
+Libs.private: @LDFLAGS@ @LIBS@
+Requires: @HWLOC_REQUIRES@

+ 1 - 1
libstarpu.pc.in

@@ -23,6 +23,6 @@ Name: starpu
 Description: offers support for heterogeneous multicore architecture
 Description: offers support for heterogeneous multicore architecture
 Version: @PACKAGE_VERSION@
 Version: @PACKAGE_VERSION@
 Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ -DSTARPU_USE_DEPRECATED_API
 Cflags: -I${includedir}/starpu/@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_CPPFLAGS@ -DSTARPU_USE_DEPRECATED_API
-Libs: -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@ @STARPU_SC_HYPERVISOR@
+Libs: @STARPU_EXPORT_DYNAMIC@ -L${libdir} -lstarpu-@STARPU_EFFECTIVE_VERSION@ @STARPU_CUDA_LDFLAGS@ @STARPU_OPENCL_LDFLAGS@ @STARPU_SC_HYPERVISOR@
 Libs.private: @LDFLAGS@ @LIBS@
 Libs.private: @LDFLAGS@ @LIBS@
 Requires: @HWLOC_REQUIRES@
 Requires: @HWLOC_REQUIRES@

+ 90 - 0
mic-configure

@@ -0,0 +1,90 @@
+#!/bin/bash
+
+ROOT_DIR=$PWD
+[ -n "$MIC_HOST" ] || MIC_HOST=x86_64-k1om-linux
+[ -n "$MIC_CC_PATH" ] || MIC_CC_PATH=/usr/linux-k1om-4.7/bin/
+[ -n "$COI_DIR" ] || COI_DIR=/opt/intel/mic/coi
+DEFAULT_PREFIX=/usr/local
+
+export PATH=${MIC_CC_PATH}${PATH:+:${PATH}}
+
+cat > ./mic-config.log << EOF
+This file was created by StarPU mic-configure
+
+ $ $0 $*
+EOF
+
+for arch in mic host
+do
+
+	# We call the configure script from a build directory further in the
+	# arborescence
+	command="${ROOT_DIR}/configure --enable-mic --with-coi-dir=$COI_DIR"
+	prefix_found=no
+
+	if test x$arch = xmic ; then
+		command="$command --without-hwloc --with-coi-lib-dir=$COI_DIR/device-linux-release/lib --host=$MIC_HOST"
+	else
+		command="$command --with-coi-lib-dir=$COI_DIR/host-linux-release/lib"
+	fi
+
+	for arg in $*
+	do
+		if [ ${arg:0:9} = '--prefix=' ]
+		then
+			prefix_found=yes
+			prefix="${arg:9}"
+			command="$command ${arg}/${arch}"
+		else
+			command="$command $arg"
+		fi
+
+	done
+
+	# If the user didn't specify a directory where to install the library
+	# we apply the default one
+	if test x$prefix_found = xno ; then
+		command="$command --prefix=${DEFAULT_PREFIX}/$arch"
+		prefix=${DEFAULT_PREFIX}
+	fi
+
+	# If the build directory doesn't exist yet, create it
+	if [ ! -d "${ROOT_DIR}/build_${arch}" ] ; then
+		mkdir "build_${arch}"
+	fi
+
+	cd "build_${arch}"
+
+	if test x$arch = xmic ; then
+		LDFLAGS=-export-dynamic $command
+	else
+		$command
+	fi
+	if [ "$?" != 0 ]
+	then
+		exit $?
+	fi
+	cd "${ROOT_DIR}"
+done
+
+cat > Makefile << EOF
+all:
+	\$(MAKE) -C build_host
+	\$(MAKE) -C build_mic
+
+clean:
+	\$(MAKE) -C build_host clean
+	\$(MAKE) -C build_mic clean
+
+distclean: clean
+	rm -f Makefile
+
+check:
+	\$(MAKE) -C build_host check
+	\$(MAKE) -C build_mic check
+
+install:
+	\$(MAKE) -C build_host install
+	\$(MAKE) -C build_mic install
+	ln -sf "${prefix}/mic/lib/pkgconfig/starpu-1.2.pc" "${prefix}/mic/lib/pkgconfig/starpu-1.2-mic.pc"
+EOF

+ 47 - 3
src/Makefile.am

@@ -49,10 +49,10 @@ endif STARPU_HAVE_WINDOWS
 
 
 lib_LTLIBRARIES = libstarpu-@STARPU_EFFECTIVE_VERSION@.la
 lib_LTLIBRARIES = libstarpu-@STARPU_EFFECTIVE_VERSION@.la
 
 
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CPPFLAGS = -I$(top_srcdir)/include/ -DBUILDING_STARPU
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CPPFLAGS = -I$(top_srcdir)/include/ $(STARPU_RCCE_CPPFLAGS) -DBUILDING_STARPU
 
 
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CFLAGS = $(GLOBAL_AM_CFLAGS) $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(FXT_CFLAGS)
-libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = -lm $(HWLOC_LIBS) $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(FXT_LIBS) $(STARPU_GLPK_LDFLAGS)
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_CFLAGS = $(GLOBAL_AM_CFLAGS) $(HWLOC_CFLAGS) $(STARPU_CUDA_CPPFLAGS) $(STARPU_OPENCL_CPPFLAGS) $(STARPU_COI_CPPFLAGS) $(STARPU_RCCE_CFLAGS) $(FXT_CFLAGS)
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LIBADD = -lm $(HWLOC_LIBS) $(STARPU_OPENCL_LDFLAGS) $(STARPU_CUDA_LDFLAGS) $(STARPU_COI_LDFLAGS) $(STARPU_RCCE_LDFLAGS) $(FXT_LIBS) $(STARPU_GLPK_LDFLAGS)
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) $(FXT_LDFLAGS) -no-undefined									\
 libstarpu_@STARPU_EFFECTIVE_VERSION@_la_LDFLAGS = $(ldflags) $(FXT_LDFLAGS) -no-undefined									\
   -version-info $(libstarpu_so_version)
   -version-info $(libstarpu_so_version)
 
 
@@ -105,11 +105,20 @@ noinst_HEADERS = 						\
 	common/uthash.h						\
 	common/uthash.h						\
 	common/barrier_counter.h				\
 	common/barrier_counter.h				\
 	drivers/driver_common/driver_common.h			\
 	drivers/driver_common/driver_common.h			\
+	drivers/mp_common/mp_common.h				\
+	drivers/mp_common/source_common.h			\
+	drivers/mp_common/sink_common.h				\
 	drivers/cpu/driver_cpu.h				\
 	drivers/cpu/driver_cpu.h				\
 	drivers/cuda/driver_cuda.h				\
 	drivers/cuda/driver_cuda.h				\
 	drivers/opencl/driver_opencl.h				\
 	drivers/opencl/driver_opencl.h				\
 	drivers/opencl/driver_opencl_utils.h			\
 	drivers/opencl/driver_opencl_utils.h			\
 	debug/starpu_debug_helpers.h				\
 	debug/starpu_debug_helpers.h				\
+	drivers/mic/driver_mic_common.h				\
+	drivers/mic/driver_mic_source.h				\
+	drivers/mic/driver_mic_sink.h				\
+	drivers/scc/driver_scc_common.h				\
+	drivers/scc/driver_scc_source.h				\
+	drivers/scc/driver_scc_sink.h				\
 	debug/traces/starpu_fxt.h				\
 	debug/traces/starpu_fxt.h				\
 	profiling/bound.h					\
 	profiling/bound.h					\
 	profiling/profiling.h					\
 	profiling/profiling.h					\
@@ -256,5 +265,40 @@ libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/opencl/driver_opencl.
 endif
 endif
 endif
 endif
 
 
+if STARPU_USE_SCC
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/scc/driver_scc_common.c
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/scc/driver_scc_source.c
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/scc/driver_scc_sink.c
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/scc/driver_scc_utils.c
+endif
+
+
+#########################################
+#										#
+#        Generic MP compilation			#
+#										#
+#########################################
+
+if STARPU_USE_MP
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mp_common/mp_common.c
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mp_common/source_common.c
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mp_common/sink_common.c
+endif
+
+#########################################
+#										#
+#	     MIC compilation				#
+#										#
+#########################################
+
+if STARPU_USE_MIC
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mic/driver_mic_common.c
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mic/driver_mic_source.c
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mic/driver_mic_sink.c
+libstarpu_@STARPU_EFFECTIVE_VERSION@_la_SOURCES += drivers/mic/driver_mic_utils.c
+endif
+
+#########################################
+
 showcheck:
 showcheck:
 	-cat /dev/null
 	-cat /dev/null

+ 2 - 0
src/common/fxt.h

@@ -37,6 +37,8 @@
 #define _STARPU_FUT_CPU_KEY	0x101
 #define _STARPU_FUT_CPU_KEY	0x101
 #define _STARPU_FUT_CUDA_KEY	0x102
 #define _STARPU_FUT_CUDA_KEY	0x102
 #define _STARPU_FUT_OPENCL_KEY	0x103
 #define _STARPU_FUT_OPENCL_KEY	0x103
+#define _STARPU_FUT_MIC_KEY	0x104
+#define _STARPU_FUT_SCC_KEY	0x105
 
 
 #define _STARPU_FUT_WORKER_INIT_START	0x5100
 #define _STARPU_FUT_WORKER_INIT_START	0x5100
 #define _STARPU_FUT_WORKER_INIT_END	0x5101
 #define _STARPU_FUT_WORKER_INIT_END	0x5101

+ 8 - 2
src/common/utils.c

@@ -130,8 +130,14 @@ char *_starpu_get_home_path(void)
 		path = getenv("HOME");
 		path = getenv("HOME");
 	if (!path)
 	if (!path)
 		path = getenv("USERPROFILE");
 		path = getenv("USERPROFILE");
-	if (!path)
-		_STARPU_ERROR("couldn't find a home place to put starpu data\n");
+	if (!path) {
+		static int warn;
+		if (!warn) {
+			warn = 1;
+			_STARPU_DISP("couldn't find a home place to put starpu data, using /tmp\n");
+		}
+		path = "/tmp";
+	}
 	return path;
 	return path;
 }
 }
 
 

+ 11 - 5
src/common/utils.h

@@ -74,23 +74,29 @@
 	DO_CREQ_v_W(_VG_USERREQ__HG_PTHREAD_SPIN_INIT_OR_UNLOCK_POST, \
 	DO_CREQ_v_W(_VG_USERREQ__HG_PTHREAD_SPIN_INIT_OR_UNLOCK_POST, \
 			struct _starpu_spinlock *, lock)
 			struct _starpu_spinlock *, lock)
 
 
+#if defined(__KNC__) || defined(__KNF__)
+#define STARPU_DEBUG_PREFIX "[starpu-mic]"
+#else
+#define STARPU_DEBUG_PREFIX "[starpu]"
+#endif
+
 #ifdef STARPU_VERBOSE
 #ifdef STARPU_VERBOSE
-#  define _STARPU_DEBUG(fmt, args ...) do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%s] " fmt ,__starpu_func__ ,##args); fflush(stderr); }} while(0)
+#  define _STARPU_DEBUG(fmt, args ...) do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%s] " fmt ,__starpu_func__ ,##args); fflush(stderr); }} while(0)
 #else
 #else
 #  define _STARPU_DEBUG(fmt, args ...) do { } while (0)
 #  define _STARPU_DEBUG(fmt, args ...) do { } while (0)
 #endif
 #endif
 
 
 #ifdef STARPU_VERBOSE0
 #ifdef STARPU_VERBOSE0
-#  define _STARPU_LOG_IN()             do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%ld][%s] -->\n", pthread_self(), __starpu_func__ ); }} while(0)
-#  define _STARPU_LOG_OUT()            do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%ld][%s] <--\n", pthread_self(), __starpu_func__ ); }} while(0)
-#  define _STARPU_LOG_OUT_TAG(outtag)  do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%ld][%s] <-- (%s)\n", pthread_self(), __starpu_func__, outtag); }} while(0)
+#  define _STARPU_LOG_IN()             do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s] -->\n", pthread_self(), __starpu_func__ ); }} while(0)
+#  define _STARPU_LOG_OUT()            do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s] <--\n", pthread_self(), __starpu_func__ ); }} while(0)
+#  define _STARPU_LOG_OUT_TAG(outtag)  do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%ld][%s] <-- (%s)\n", pthread_self(), __starpu_func__, outtag); }} while(0)
 #else
 #else
 #  define _STARPU_LOG_IN()
 #  define _STARPU_LOG_IN()
 #  define _STARPU_LOG_OUT()
 #  define _STARPU_LOG_OUT()
 #  define _STARPU_LOG_OUT_TAG(outtag)
 #  define _STARPU_LOG_OUT_TAG(outtag)
 #endif
 #endif
 
 
-#define _STARPU_DISP(fmt, args ...) do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, "[starpu][%s] " fmt ,__starpu_func__ ,##args); }} while(0)
+#define _STARPU_DISP(fmt, args ...) do { if (!getenv("STARPU_SILENT")) {fprintf(stderr, STARPU_DEBUG_PREFIX"[%s] " fmt ,__starpu_func__ ,##args); }} while(0)
 #define _STARPU_ERROR(fmt, args ...)                                                  \
 #define _STARPU_ERROR(fmt, args ...)                                                  \
 	do {                                                                          \
 	do {                                                                          \
                 fprintf(stderr, "\n\n[starpu][%s] Error: " fmt ,__starpu_func__ ,##args);    \
                 fprintf(stderr, "\n\n[starpu][%s] Error: " fmt ,__starpu_func__ ,##args);    \

+ 3 - 2
src/core/dependencies/data_concurrency.c

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2010-2012  Université de Bordeaux 1
+ * Copyright (C) 2010-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -55,6 +55,7 @@ static struct _starpu_data_requester *may_unlock_data_req_list_head(starpu_data_
 	if (handle->refcnt == 0)
 	if (handle->refcnt == 0)
 		return _starpu_data_requester_list_pop_front(req_list);
 		return _starpu_data_requester_list_pop_front(req_list);
 
 
+	/* Already writing to it, do not let another write access through */
 	if (handle->current_mode == STARPU_W)
 	if (handle->current_mode == STARPU_W)
 		return NULL;
 		return NULL;
 
 
@@ -193,7 +194,7 @@ static unsigned attempt_to_submit_data_request_from_job(struct _starpu_job *j, u
 	/* Note that we do not access j->task->handles, but j->ordered_buffers
 	/* Note that we do not access j->task->handles, but j->ordered_buffers
 	 * which is a sorted copy of it. */
 	 * which is a sorted copy of it. */
 	starpu_data_handle_t handle = _STARPU_JOB_GET_ORDERED_BUFFER_HANDLE(j, buffer_index);
 	starpu_data_handle_t handle = _STARPU_JOB_GET_ORDERED_BUFFER_HANDLE(j, buffer_index);
-	enum starpu_data_access_mode mode = _STARPU_JOB_GET_ORDERED_BUFFER_MODE(j, buffer_index);
+	enum starpu_data_access_mode mode = _STARPU_JOB_GET_ORDERED_BUFFER_MODE(j, buffer_index) & ~STARPU_COMMUTE;
 
 
 	return _starpu_attempt_to_submit_data_request(1, handle, mode, NULL, NULL, j, buffer_index);
 	return _starpu_attempt_to_submit_data_request(1, handle, mode, NULL, NULL, j, buffer_index);
 }
 }

+ 100 - 130
src/core/dependencies/implicit_data_deps.c

@@ -46,23 +46,22 @@ static void _starpu_add_dependency(starpu_data_handle_t handle STARPU_ATTRIBUTE_
 	_starpu_add_ghost_dependency(handle, _starpu_get_job_associated_to_task(previous)->job_id, next);
 	_starpu_add_ghost_dependency(handle, _starpu_get_job_associated_to_task(previous)->job_id, next);
 }
 }
 
 
-/* Read after Write (RAW) or Read after Read (RAR) */
-static void _starpu_add_reader_after_writer(starpu_data_handle_t handle, struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task)
+/* Add pre_sync_task as new accessor among the existing ones, making it depend on the last synchronization task if any.  */
+static void _starpu_add_accessor(starpu_data_handle_t handle, struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task)
 {
 {
 	/* Add this task to the list of readers */
 	/* Add this task to the list of readers */
 	struct _starpu_task_wrapper_list *link = (struct _starpu_task_wrapper_list *) malloc(sizeof(struct _starpu_task_wrapper_list));
 	struct _starpu_task_wrapper_list *link = (struct _starpu_task_wrapper_list *) malloc(sizeof(struct _starpu_task_wrapper_list));
 	link->task = post_sync_task;
 	link->task = post_sync_task;
-	link->next = handle->last_submitted_readers;
-	handle->last_submitted_readers = link;
+	link->next = handle->last_submitted_accessors;
+	handle->last_submitted_accessors = link;
 
 
-	/* This task depends on the previous writer if any */
-	if (handle->last_submitted_writer && handle->last_submitted_writer != post_sync_task)
+	/* This task depends on the previous synchronization task if any */
+	if (handle->last_sync_task && handle->last_sync_task != post_sync_task)
 	{
 	{
-		_STARPU_DEP_DEBUG("RAW %p\n", handle);
-		struct starpu_task *task_array[1] = {handle->last_submitted_writer};
+		struct starpu_task *task_array[1] = {handle->last_sync_task};
 		_starpu_task_declare_deps_array(pre_sync_task, 1, task_array, 0);
 		_starpu_task_declare_deps_array(pre_sync_task, 1, task_array, 0);
-		_starpu_add_dependency(handle, handle->last_submitted_writer, pre_sync_task);
-		_STARPU_DEP_DEBUG("dep %p -> %p\n", handle->last_submitted_writer, pre_sync_task);
+		_starpu_add_dependency(handle, handle->last_sync_task, pre_sync_task);
+		_STARPU_DEP_DEBUG("dep %p -> %p\n", handle->last_sync_task, pre_sync_task);
 	}
 	}
         else
         else
         {
         {
@@ -82,12 +81,12 @@ static void _starpu_add_reader_after_writer(starpu_data_handle_t handle, struct
 #ifdef HAVE_AYUDAME_H
 #ifdef HAVE_AYUDAME_H
 		|| AYU_event
 		|| AYU_event
 #endif
 #endif
-		) && handle->last_submitted_ghost_writer_id_is_valid)
+		) && handle->last_submitted_ghost_sync_id_is_valid)
 	{
 	{
-		_STARPU_TRACE_GHOST_TASK_DEPS(handle->last_submitted_ghost_writer_id,
+		_STARPU_TRACE_GHOST_TASK_DEPS(handle->last_submitted_ghost_sync_id,
 			_starpu_get_job_associated_to_task(pre_sync_task)->job_id);
 			_starpu_get_job_associated_to_task(pre_sync_task)->job_id);
-		_starpu_add_ghost_dependency(handle, handle->last_submitted_ghost_writer_id, pre_sync_task);
-		_STARPU_DEP_DEBUG("dep ID%lu -> %p\n", handle->last_submitted_ghost_writer_id, pre_sync_task);
+		_starpu_add_ghost_dependency(handle, handle->last_submitted_ghost_sync_id, pre_sync_task);
+		_STARPU_DEP_DEBUG("dep ID%lu -> %p\n", handle->last_submitted_ghost_sync_id, pre_sync_task);
 	}
 	}
 
 
 	if (!pre_sync_task->cl) {
 	if (!pre_sync_task->cl) {
@@ -99,27 +98,27 @@ static void _starpu_add_reader_after_writer(starpu_data_handle_t handle, struct
 	}
 	}
 }
 }
 
 
-/* Write after Read (WAR) */
-static void _starpu_add_writer_after_readers(starpu_data_handle_t handle, struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task)
+/* This adds a new synchronization task which depends on all the previous accessors */
+static void _starpu_add_sync_task(starpu_data_handle_t handle, struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task)
 {
 {
-	/* Count the readers */
-	unsigned nreaders = 0;
+	/* Count the existing accessors */
+	unsigned naccessors = 0;
 	struct _starpu_task_wrapper_list *l;
 	struct _starpu_task_wrapper_list *l;
-	l = handle->last_submitted_readers;
+	l = handle->last_submitted_accessors;
 	while (l)
 	while (l)
 	{
 	{
 		if (l->task != post_sync_task)
 		if (l->task != post_sync_task)
-			nreaders++;
+			naccessors++;
 		l = l->next;
 		l = l->next;
 	}
 	}
-	_STARPU_DEP_DEBUG("%d readers\n", nreaders);
+	_STARPU_DEP_DEBUG("%d accessors\n", naccessors);
 
 
-	if (nreaders > 0)
+	if (naccessors > 0)
 	{
 	{
 		/* Put all tasks in the list into task_array */
 		/* Put all tasks in the list into task_array */
-		struct starpu_task *task_array[nreaders];
+		struct starpu_task *task_array[naccessors];
 		unsigned i = 0;
 		unsigned i = 0;
-		l = handle->last_submitted_readers;
+		l = handle->last_submitted_accessors;
 		while (l)
 		while (l)
 		{
 		{
 			STARPU_ASSERT(l->task);
 			STARPU_ASSERT(l->task);
@@ -134,80 +133,31 @@ static void _starpu_add_writer_after_readers(starpu_data_handle_t handle, struct
 			l = l->next;
 			l = l->next;
 			free(prev);
 			free(prev);
 		}
 		}
-		_starpu_task_declare_deps_array(pre_sync_task, nreaders, task_array, 0);
+		_starpu_task_declare_deps_array(pre_sync_task, naccessors, task_array, 0);
 	}
 	}
 #ifndef STARPU_USE_FXT
 #ifndef STARPU_USE_FXT
 	if (_starpu_bound_recording)
 	if (_starpu_bound_recording)
 #endif
 #endif
 	{
 	{
-		/* Declare all dependencies with ghost readers */
-		struct _starpu_jobid_list *ghost_readers_id = handle->last_submitted_ghost_readers_id;
-		while (ghost_readers_id)
+		/* Declare all dependencies with ghost accessors */
+		struct _starpu_jobid_list *ghost_accessors_id = handle->last_submitted_ghost_accessors_id;
+		while (ghost_accessors_id)
 		{
 		{
-			unsigned long id = ghost_readers_id->id;
+			unsigned long id = ghost_accessors_id->id;
 			_STARPU_TRACE_GHOST_TASK_DEPS(id,
 			_STARPU_TRACE_GHOST_TASK_DEPS(id,
 				_starpu_get_job_associated_to_task(pre_sync_task)->job_id);
 				_starpu_get_job_associated_to_task(pre_sync_task)->job_id);
 			_starpu_add_ghost_dependency(handle, id, pre_sync_task);
 			_starpu_add_ghost_dependency(handle, id, pre_sync_task);
 			_STARPU_DEP_DEBUG("dep ID%lu -> %p\n", id, pre_sync_task);
 			_STARPU_DEP_DEBUG("dep ID%lu -> %p\n", id, pre_sync_task);
 
 
-			struct _starpu_jobid_list *prev = ghost_readers_id;
-			ghost_readers_id = ghost_readers_id->next;
+			struct _starpu_jobid_list *prev = ghost_accessors_id;
+			ghost_accessors_id = ghost_accessors_id->next;
 			free(prev);
 			free(prev);
 		}
 		}
-		handle->last_submitted_ghost_readers_id = NULL;
+		handle->last_submitted_ghost_accessors_id = NULL;
 	}
 	}
 
 
-	handle->last_submitted_readers = NULL;
-	handle->last_submitted_writer = post_sync_task;
-
-	if (!post_sync_task->cl) {
-		/* Add a reference to be released in _starpu_handle_job_termination */
-		_starpu_spin_lock(&handle->header_lock);
-		handle->busy_count++;
-		_starpu_spin_unlock(&handle->header_lock);
-		_starpu_get_job_associated_to_task(post_sync_task)->implicit_dep_handle = handle;
-	}
-}
-
-/* Write after Write (WAW) */
-static void _starpu_add_writer_after_writer(starpu_data_handle_t handle, struct starpu_task *pre_sync_task, struct starpu_task *post_sync_task)
-{
-	/* (Read) Write */
-	/* This task depends on the previous writer */
-	if (handle->last_submitted_writer && handle->last_submitted_writer != post_sync_task)
-	{
-		struct starpu_task *task_array[1] = {handle->last_submitted_writer};
-		_starpu_task_declare_deps_array(pre_sync_task, 1, task_array, 0);
-		_starpu_add_dependency(handle, handle->last_submitted_writer, pre_sync_task);
-		_STARPU_DEP_DEBUG("dep %p -> %p\n", handle->last_submitted_writer, pre_sync_task);
-	}
-        else
-        {
-		_STARPU_DEP_DEBUG("No dep\n");
-        }
-
-	/* If there is a ghost writer instead, we
-	 * should declare a ghost dependency here, and
-	 * invalidate the ghost value. */
-#ifndef STARPU_USE_FXT
-	if (_starpu_bound_recording)
-#endif
-	{
-		if (handle->last_submitted_ghost_writer_id_is_valid)
-		{
-			_STARPU_TRACE_GHOST_TASK_DEPS(handle->last_submitted_ghost_writer_id, 
-				_starpu_get_job_associated_to_task(pre_sync_task)->job_id);
-			_starpu_add_ghost_dependency(handle, handle->last_submitted_ghost_writer_id, pre_sync_task);
-			_STARPU_DEP_DEBUG("dep ID%lu -> %p\n", handle->last_submitted_ghost_writer_id, pre_sync_task);
-			handle->last_submitted_ghost_writer_id_is_valid = 0;
-		}
-                else
-                {
-			_STARPU_DEP_DEBUG("No dep ID\n");
-                }
-	}
-
-	handle->last_submitted_writer = post_sync_task;
+	handle->last_submitted_accessors = NULL;
+	handle->last_sync_task = post_sync_task;
 
 
 	if (!post_sync_task->cl) {
 	if (!post_sync_task->cl) {
 		/* Add a reference to be released in _starpu_handle_job_termination */
 		/* Add a reference to be released in _starpu_handle_job_termination */
@@ -245,7 +195,6 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 		if (pre_sync_job->reduction_task || post_sync_job->reduction_task)
 		if (pre_sync_job->reduction_task || post_sync_job->reduction_task)
 			return NULL;
 			return NULL;
 
 
-		_STARPU_DEP_DEBUG("Tasks %p %p\n", pre_sync_task, post_sync_task);
 		/* In case we are generating the DAG, we add an implicit
 		/* In case we are generating the DAG, we add an implicit
 		 * dependency between the pre and the post sync tasks in case
 		 * dependency between the pre and the post sync tasks in case
 		 * they are not the same. */
 		 * they are not the same. */
@@ -261,54 +210,75 @@ struct starpu_task *_starpu_detect_implicit_data_deps_with_handle(struct starpu_
 
 
 		enum starpu_data_access_mode previous_mode = handle->last_submitted_mode;
 		enum starpu_data_access_mode previous_mode = handle->last_submitted_mode;
 
 
-		if (mode & STARPU_W)
+		_STARPU_DEP_DEBUG("Handle %p Tasks %p %p %x->%x\n", handle, pre_sync_task, post_sync_task, previous_mode, mode);
+
+		/*
+		 * Tasks can access the data concurrently only if they have the
+		 * same access mode, which can only be either:
+		 * - write with STARPU_COMMUTE
+		 * - read
+		 * - redux
+		 *
+		 * In other cases, the tasks have to depend on each other.
+		 */
+
+		/* Bitwise tests: both the new and the previous access must be
+		 * commutative writes to be allowed to run concurrently. */
+		if (((mode & STARPU_W) && (mode & STARPU_COMMUTE) && (previous_mode & STARPU_W) && (previous_mode & STARPU_COMMUTE))
+		  || (mode == STARPU_R && previous_mode == STARPU_R)
+		  || (mode == STARPU_REDUX && previous_mode == STARPU_REDUX))
 		{
 		{
-			_STARPU_DEP_DEBUG("W %p\n", handle);
-			if (previous_mode & STARPU_W)
-			{
-				_STARPU_DEP_DEBUG("WAW %p\n", handle);
-				_starpu_add_writer_after_writer(handle, pre_sync_task, post_sync_task);
-			}
-			else
-			{
-				/* The task submitted previously were in read-only
-				 * mode: this task must depend on all those read-only
-				 * tasks and we get rid of the list of readers */
-				_STARPU_DEP_DEBUG("WAR %p\n", handle);
-				_starpu_add_writer_after_readers(handle, pre_sync_task, post_sync_task);
-			}
+			_STARPU_DEP_DEBUG("concurrently\n");
+			/* Can access concurrently with current tasks */
+			_starpu_add_accessor(handle, pre_sync_task, post_sync_task);
 		}
 		}
 		else
 		else
 		{
 		{
-			_STARPU_DEP_DEBUG("R %p %d -> %d\n", handle, previous_mode, mode);
-			/* Add a reader, after a writer or a reader. */
-			STARPU_ASSERT(pre_sync_task);
-			STARPU_ASSERT(post_sync_task);
+			/* Can not access concurrently, have to wait for existing accessors */
+			struct _starpu_task_wrapper_list *l = handle->last_submitted_accessors;
+			_STARPU_DEP_DEBUG("dependency\n");
 
 
-			STARPU_ASSERT(mode & (STARPU_R|STARPU_REDUX));
-
-			if (!(previous_mode & STARPU_W) && (mode != previous_mode))
+			if (l && l->next)
 			{
 			{
-				/* Read after Redux or Redux after Read: we
-				 * insert a dummy synchronization task so that
-				 * we don't need to have a gigantic number of
-				 * dependencies between all readers and all
-				 * redux tasks. */
-
-				/* Create an empty task */
-				struct starpu_task *new_sync_task;
-				new_sync_task = starpu_task_create();
-				STARPU_ASSERT(new_sync_task);
-				new_sync_task->cl = NULL;
+				/* Several previous accessors */
+
+				if (mode == STARPU_W)
+				{
+					/* Optimization: this task can not
+					 * combine with others anyway, use it
+					 * as synchronization task by making it
+					 * wait for the previous ones. */
+					_starpu_add_sync_task(handle, pre_sync_task, post_sync_task);
+				} else {
+					_STARPU_DEP_DEBUG("several predecessors, adding sync task\n");
+					/* insert an empty synchronization task
+					 * which waits for the whole set,
+					 * instead of creating a quadratic
+					 * number of dependencies. */
+					struct starpu_task *sync_task = starpu_task_create();
+					STARPU_ASSERT(sync_task);
+					sync_task->cl = NULL;
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
-				_starpu_get_job_associated_to_task(new_sync_task)->model_name = "sync_task_redux";
+					_starpu_get_job_associated_to_task(sync_task)->model_name = "sync_task_redux";
 #endif
 #endif
+					/* Make this task wait for the previous ones */
+					_starpu_add_sync_task(handle, sync_task, sync_task);
+					/* And the requested task wait for this one */
+					_starpu_add_accessor(handle, pre_sync_task, post_sync_task);
 
 
-				_starpu_add_writer_after_readers(handle, new_sync_task, new_sync_task);
-
-				task = new_sync_task;
+					task = sync_task;
+				}
+			}
+			else
+			{
+				if (l)
+				{
+					/* One previous accessor, make it the sync
+					 * task, and start depending on it. */
+					handle->last_sync_task = l->task;
+					handle->last_submitted_accessors = NULL;
+					free(l);
+				}
+				_starpu_add_accessor(handle, pre_sync_task, post_sync_task);
 			}
 			}
-			_starpu_add_reader_after_writer(handle, pre_sync_task, post_sync_task);
 		}
 		}
 		handle->last_submitted_mode = mode;
 		handle->last_submitted_mode = mode;
 	}
 	}
@@ -374,18 +344,18 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 
 
 		/* If this is the last writer, there is no point in adding
 		/* If this is the last writer, there is no point in adding
 		 * extra deps to that tasks that does not exists anymore */
 		 * extra deps to that tasks that does not exists anymore */
-		if (task == handle->last_submitted_writer)
+		if (task == handle->last_sync_task)
 		{
 		{
-			handle->last_submitted_writer = NULL;
+			handle->last_sync_task = NULL;
 
 
 #ifndef STARPU_USE_FXT
 #ifndef STARPU_USE_FXT
 			if (_starpu_bound_recording)
 			if (_starpu_bound_recording)
 #endif
 #endif
 			{
 			{
 				/* Save the previous writer as the ghost last writer */
 				/* Save the previous writer as the ghost last writer */
-				handle->last_submitted_ghost_writer_id_is_valid = 1;
+				handle->last_submitted_ghost_sync_id_is_valid = 1;
 				struct _starpu_job *ghost_job = _starpu_get_job_associated_to_task(task);
 				struct _starpu_job *ghost_job = _starpu_get_job_associated_to_task(task);
-				handle->last_submitted_ghost_writer_id = ghost_job->job_id;
+				handle->last_submitted_ghost_sync_id = ghost_job->job_id;
 			}
 			}
 		}
 		}
 
 
@@ -397,7 +367,7 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 		/* Same if this is one of the readers: we go through the list
 		/* Same if this is one of the readers: we go through the list
 		 * of readers and remove the task if it is found. */
 		 * of readers and remove the task if it is found. */
 		struct _starpu_task_wrapper_list *l;
 		struct _starpu_task_wrapper_list *l;
-		l = handle->last_submitted_readers;
+		l = handle->last_submitted_accessors;
 		struct _starpu_task_wrapper_list *prev = NULL;
 		struct _starpu_task_wrapper_list *prev = NULL;
 #ifdef STARPU_DEVEL
 #ifdef STARPU_DEVEL
 #warning TODO: use double-linked list to make finding ourself fast
 #warning TODO: use double-linked list to make finding ourself fast
@@ -419,9 +389,9 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 					struct _starpu_job *ghost_reader_job = _starpu_get_job_associated_to_task(task);
 					struct _starpu_job *ghost_reader_job = _starpu_get_job_associated_to_task(task);
 					struct _starpu_jobid_list *link = (struct _starpu_jobid_list *) malloc(sizeof(struct _starpu_jobid_list));
 					struct _starpu_jobid_list *link = (struct _starpu_jobid_list *) malloc(sizeof(struct _starpu_jobid_list));
 					STARPU_ASSERT(link);
 					STARPU_ASSERT(link);
-					link->next = handle->last_submitted_ghost_readers_id;
+					link->next = handle->last_submitted_ghost_accessors_id;
 					link->id = ghost_reader_job->job_id;
 					link->id = ghost_reader_job->job_id;
-					handle->last_submitted_ghost_readers_id = link;
+					handle->last_submitted_ghost_accessors_id = link;
 				}
 				}
 
 
 				if (prev)
 				if (prev)
@@ -431,7 +401,7 @@ void _starpu_release_data_enforce_sequential_consistency(struct starpu_task *tas
 				else
 				else
 				{
 				{
 					/* This is the first element of the list */
 					/* This is the first element of the list */
-					handle->last_submitted_readers = next;
+					handle->last_submitted_accessors = next;
 				}
 				}
 
 
 				/* XXX can we really find the same task again
 				/* XXX can we really find the same task again

+ 6 - 0
src/core/jobs.h

@@ -51,6 +51,8 @@ typedef void (*_starpu_cl_func_t)(void **, void *);
 #define _STARPU_CPU_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_CPU)
 #define _STARPU_CPU_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_CPU)
 #define _STARPU_CUDA_MAY_PERFORM(j)      ((j)->task->cl->where & STARPU_CUDA)
 #define _STARPU_CUDA_MAY_PERFORM(j)      ((j)->task->cl->where & STARPU_CUDA)
 #define _STARPU_OPENCL_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_OPENCL)
 #define _STARPU_OPENCL_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_OPENCL)
+#define _STARPU_MIC_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_MIC)
+#define _STARPU_SCC_MAY_PERFORM(j)	((j)->task->cl->where & STARPU_SCC)
 
 
 /* A job is the internal representation of a task. */
 /* A job is the internal representation of a task. */
 LIST_TYPE(_starpu_job,
 LIST_TYPE(_starpu_job,
@@ -116,6 +118,10 @@ LIST_TYPE(_starpu_job,
 	 * so we need a flag to differentiate them from "normal" tasks. */
 	 * so we need a flag to differentiate them from "normal" tasks. */
 	unsigned reduction_task;
 	unsigned reduction_task;
 
 
+	/* Used by MIC driver to record codelet start time instead of using a
+	 * local variable */
+	struct timespec cl_start;
+
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
 	/* A symbol name may be associated to the job directly for debug
 	/* A symbol name may be associated to the job directly for debug
 	 * purposes (for instance if the codelet is NULL). */
 	 * purposes (for instance if the codelet is NULL). */

+ 54 - 5
src/core/perfmodel/perfmodel_bus.c

@@ -67,6 +67,7 @@ static unsigned was_benchmarked = 0;
 static unsigned ncpus = 0;
 static unsigned ncpus = 0;
 static unsigned ncuda = 0;
 static unsigned ncuda = 0;
 static unsigned nopencl = 0;
 static unsigned nopencl = 0;
+static unsigned nmic = 0;
 
 
 /* Benchmarking the performance of the bus */
 /* Benchmarking the performance of the bus */
 
 
@@ -91,6 +92,11 @@ static double opencldev_latency_dtoh[STARPU_MAXNODES] = {0.0};
 static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*STARPU_MAXCPUS];
 static struct dev_timing opencldev_timing_per_cpu[STARPU_MAXNODES*STARPU_MAXCPUS];
 #endif
 #endif
 
 
+#ifdef STARPU_USE_MIC
+static double mic_time_host_to_device[STARPU_MAXNODES] = {0.0};
+static double mic_time_device_to_host[STARPU_MAXNODES] = {0.0};
+#endif /* STARPU_USE_MIC */
+
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
 static hwloc_topology_t hwtopology;
 static hwloc_topology_t hwtopology;
 #endif
 #endif
@@ -632,7 +638,7 @@ static void benchmark_all_gpu_devices(void)
 	_STARPU_DISP("can not measure bus in simgrid mode, please run starpu_calibrate_bus in non-simgrid mode to make sure the bus performance model was calibrated\n");
 	_STARPU_DISP("can not measure bus in simgrid mode, please run starpu_calibrate_bus in non-simgrid mode to make sure the bus performance model was calibrated\n");
 	STARPU_ABORT();
 	STARPU_ABORT();
 #else /* !SIMGRID */
 #else /* !SIMGRID */
-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_MIC)
 	unsigned i;
 	unsigned i;
 #endif
 #endif
 #ifdef HAVE_CUDA_MEMCPY_PEER
 #ifdef HAVE_CUDA_MEMCPY_PEER
@@ -695,6 +701,19 @@ static void benchmark_all_gpu_devices(void)
 	}
 	}
 #endif
 #endif
 
 
+#ifdef STARPU_USE_MIC
+	/* TODO: implement real calibration! For now we only assign an
+	 * arbitrary value to each device here as a bug fix, otherwise we get
+	 * problems with the heft scheduler */
+        nmic = _starpu_mic_src_get_device_count();
+
+	for (i = 0; i < STARPU_MAXNODES; i++)
+	{
+		mic_time_host_to_device[i] = 0.1;
+		mic_time_device_to_host[i] = 0.1;
+	}
+#endif /* STARPU_USE_MIC */
+
 #ifdef STARPU_HAVE_HWLOC
 #ifdef STARPU_HAVE_HWLOC
 	hwloc_set_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
 	hwloc_set_cpubind(hwtopology, former_cpuset, HWLOC_CPUBIND_THREAD);
 #elif __linux__
 #elif __linux__
@@ -1082,6 +1101,9 @@ static void write_bus_latency_file_content(void)
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
         maxnode += nopencl;
         maxnode += nopencl;
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+        maxnode += nmic;
+#endif
         for (src = 0; src < STARPU_MAXNODES; src++)
         for (src = 0; src < STARPU_MAXNODES; src++)
 	{
 	{
 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
@@ -1290,6 +1312,9 @@ static void write_bus_bandwidth_file_content(void)
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
         maxnode += nopencl;
         maxnode += nopencl;
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+        maxnode += nmic;
+#endif
 	for (src = 0; src < STARPU_MAXNODES; src++)
 	for (src = 0; src < STARPU_MAXNODES; src++)
 	{
 	{
 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
 		for (dst = 0; dst < STARPU_MAXNODES; dst++)
@@ -1300,7 +1325,7 @@ static void write_bus_bandwidth_file_content(void)
 			{
 			{
 				bandwidth = NAN;
 				bandwidth = NAN;
 			}
 			}
-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL)
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_MIC)
 			else if (src != dst)
 			else if (src != dst)
 			{
 			{
 				double slowness = 0.0;
 				double slowness = 0.0;
@@ -1319,12 +1344,19 @@ static void write_bus_bandwidth_file_content(void)
 						slowness += cudadev_timing_htod[dst];
 						slowness += cudadev_timing_htod[dst];
 				}
 				}
 #endif
 #endif
+				/* TODO: generalize computation */
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
-				if (src > ncuda)
+				if (src > ncuda && src <= ncuda + nopencl)
 					slowness += opencldev_timing_dtoh[src-ncuda];
 					slowness += opencldev_timing_dtoh[src-ncuda];
-				if (dst > ncuda)
+				if (dst > ncuda && dst <= ncuda + nopencl)
 					slowness += opencldev_timing_htod[dst-ncuda];
 					slowness += opencldev_timing_htod[dst-ncuda];
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+				if (src > ncuda + nopencl)
+					slowness += mic_time_device_to_host[src - (ncuda + nopencl)];
+				if (dst > ncuda + nopencl)
+					slowness += mic_time_host_to_device[dst - (ncuda + nopencl)];
+#endif
 				bandwidth = 1.0/slowness;
 				bandwidth = 1.0/slowness;
 			}
 			}
 #endif
 #endif
@@ -1364,6 +1396,9 @@ void starpu_bus_print_bandwidth(FILE *f)
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
         maxnode += nopencl;
         maxnode += nopencl;
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+        maxnode += nmic;
+#endif
 
 
 	fprintf(f, "from/to\t");
 	fprintf(f, "from/to\t");
 	fprintf(f, "RAM\t");
 	fprintf(f, "RAM\t");
@@ -1501,7 +1536,7 @@ static void check_bus_config_file(void)
 	{
 	{
                 FILE *f;
                 FILE *f;
                 int ret;
                 int ret;
-		unsigned read_cuda = -1, read_opencl = -1;
+		unsigned read_cuda = -1, read_opencl = -1, read_mic = -1;
                 unsigned read_cpus = -1;
                 unsigned read_cpus = -1;
 
 
                 // Loading configuration from file
                 // Loading configuration from file
@@ -1517,6 +1552,10 @@ static void check_bus_config_file(void)
 		ret = fscanf(f, "%d\t", &read_opencl);
 		ret = fscanf(f, "%d\t", &read_opencl);
 		STARPU_ASSERT(ret == 1);
 		STARPU_ASSERT(ret == 1);
                 _starpu_drop_comments(f);
                 _starpu_drop_comments(f);
+		/* The MIC line may be absent from the stored file: fscanf()
+		 * then returns EOF rather than 0, so anything other than one
+		 * successful conversion means "no MIC count stored". */
+		ret = fscanf(f, "%d\t", &read_mic);
+		if (ret != 1)
+			read_mic = 0;
+                _starpu_drop_comments(f);
                 fclose(f);
                 fclose(f);
 
 
                 // Loading current configuration
                 // Loading current configuration
@@ -1527,6 +1566,9 @@ static void check_bus_config_file(void)
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
                 nopencl = _starpu_opencl_get_device_count();
                 nopencl = _starpu_opencl_get_device_count();
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+                nmic = _starpu_mic_src_get_device_count();
+#endif /* STARPU_USE_MIC */
 
 
                 // Checking if both configurations match
                 // Checking if both configurations match
                 if (read_cpus != ncpus)
                 if (read_cpus != ncpus)
@@ -1547,6 +1589,12 @@ static void check_bus_config_file(void)
                         _starpu_bus_force_sampling();
                         _starpu_bus_force_sampling();
 			_STARPU_DISP("... done\n");
 			_STARPU_DISP("... done\n");
                 }
                 }
+                else if (read_mic != nmic)
+		{
+                        _STARPU_DISP("Current configuration does not match the bus performance model (MIC: (stored) %d != (current) %d), recalibrating...\n", read_mic, nmic);
+                        _starpu_bus_force_sampling();
+			_STARPU_DISP("... done\n");
+                }
         }
         }
 }
 }
 
 
@@ -1567,6 +1615,7 @@ static void write_bus_config_file_content(void)
         fprintf(f, "%u # Number of CPUs\n", ncpus);
         fprintf(f, "%u # Number of CPUs\n", ncpus);
         fprintf(f, "%d # Number of CUDA devices\n", ncuda);
         fprintf(f, "%d # Number of CUDA devices\n", ncuda);
         fprintf(f, "%d # Number of OpenCL devices\n", nopencl);
         fprintf(f, "%d # Number of OpenCL devices\n", nopencl);
+        fprintf(f, "%d # Number of MIC devices\n", nmic);
 
 
         fclose(f);
         fclose(f);
 }
 }

+ 45 - 16
src/core/perfmodel/perfmodel_history.c

@@ -366,6 +366,22 @@ static void parse_model_file(FILE *f, struct starpu_perfmodel *model, unsigned s
 			   archmin + STARPU_MIN(narchs, STARPU_MAXOPENCLDEVS),
 			   archmin + STARPU_MIN(narchs, STARPU_MAXOPENCLDEVS),
 			   narchs > STARPU_MAXOPENCLDEVS ? narchs - STARPU_MAXOPENCLDEVS : 0);
 			   narchs > STARPU_MAXOPENCLDEVS ? narchs - STARPU_MAXOPENCLDEVS : 0);
 	}
 	}
+
+	/* Parsing MIC devs.  When the file has no MIC section, fscanf() hits
+	 * end of file and returns EOF (not 0), leaving narchs at the value
+	 * read for the previous (OpenCL) section.  Treat anything other than
+	 * one successful conversion as "no MIC architectures". */
+	_starpu_drop_comments(f);
+	ret = fscanf(f, "%u\n", &narchs);
+	if (ret != 1)
+		narchs = 0;
+
+	archmin += STARPU_MAXOPENCLDEVS;
+	_STARPU_DEBUG("Parsing %u MIC devices\n", narchs);
+	if (narchs > 0)
+	{
+		parse_arch(f, model, scan_history,
+			   archmin,
+			   archmin + STARPU_MIN(narchs, STARPU_MAXMICDEVS),
+			   narchs > STARPU_MAXMICDEVS ? narchs - STARPU_MAXMICDEVS : 0);
+	}
 }
 }
 
 
 
 
@@ -447,6 +463,7 @@ static void dump_model_file(FILE *f, struct starpu_perfmodel *model)
 		{
 		{
 			case STARPU_CUDA_DEFAULT:
 			case STARPU_CUDA_DEFAULT:
 			case STARPU_OPENCL_DEFAULT:
 			case STARPU_OPENCL_DEFAULT:
+			case STARPU_MIC_DEFAULT:
 				arch_base = arch;
 				arch_base = arch;
 				idx++;
 				idx++;
 				break;
 				break;
@@ -479,42 +496,48 @@ static void dump_model_file(FILE *f, struct starpu_perfmodel *model)
 	}
 	}
 
 
 	/* Writing stuff */
 	/* Writing stuff */
+
 	char *name = "unknown";
 	char *name = "unknown";
 	unsigned substract_to_arch = 0;
 	unsigned substract_to_arch = 0;
 	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
 	for (arch = 0; arch < STARPU_NARCH_VARIATIONS; arch++)
 	{
 	{
+		unsigned char arch_already_visited = 0;
+
 		switch (arch)
 		switch (arch)
 		{
 		{
 			case STARPU_CPU_DEFAULT:
 			case STARPU_CPU_DEFAULT:
-				arch_base = arch;
 				name = "CPU";
 				name = "CPU";
-				fprintf(f, "##################\n");
-				fprintf(f, "# %ss\n", name);
-				fprintf(f, "# maximum number of %ss\n", name);
-				fprintf(f, "%u\n", my_narch = narch[0]);
+				my_narch = narch[0];
 				break;
 				break;
 			case STARPU_CUDA_DEFAULT:
 			case STARPU_CUDA_DEFAULT:
-				arch_base = arch;
 				name = "CUDA";
 				name = "CUDA";
 				substract_to_arch = STARPU_MAXCPUS;
 				substract_to_arch = STARPU_MAXCPUS;
-				fprintf(f, "##################\n");
-				fprintf(f, "# %ss\n", name);
-				fprintf(f, "# number of %s architectures\n", name);
-				fprintf(f, "%u\n", my_narch = narch[1]);
+				my_narch = narch[1];
 				break;
 				break;
 			case STARPU_OPENCL_DEFAULT:
 			case STARPU_OPENCL_DEFAULT:
-				arch_base = arch;
 				name = "OPENCL";
 				name = "OPENCL";
-				substract_to_arch += STARPU_MAXCUDADEVS;
-				fprintf(f, "##################\n");
-				fprintf(f, "# %ss\n", name);
-				fprintf(f, "# number of %s architectures\n", name);
-				fprintf(f, "%u\n", my_narch = narch[2]);
+				my_narch = narch[2];
+				break;
+			case STARPU_MIC_DEFAULT:
+				name = "MIC";
+				my_narch = narch[3];
 				break;
 				break;
 			default:
 			default:
+				/* The current worker arch was already written,
+				 * we don't need to write it again */
+				arch_already_visited = 1;
 				break;
 				break;
 		}
 		}
 
 
+		if (!arch_already_visited)
+		{
+			arch_base = arch;
+			fprintf(f, "##################\n");
+			fprintf(f, "# %ss\n", name);
+			fprintf(f, "# number of %s architectures\n", name);
+			fprintf(f, "%u\n", my_narch);
+		}
+
 		unsigned max_impl = 0;
 		unsigned max_impl = 0;
 		if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
 		if (model->type == STARPU_HISTORY_BASED || model->type == STARPU_NL_REGRESSION_BASED)
 		{
 		{
@@ -1024,6 +1047,12 @@ void starpu_perfmodel_get_arch_name(enum starpu_perfmodel_archtype arch, char *a
 		int devid = arch - STARPU_OPENCL_DEFAULT;
 		int devid = arch - STARPU_OPENCL_DEFAULT;
 		snprintf(archname, maxlen, "opencl_%d_impl_%u", devid,nimpl);
 		snprintf(archname, maxlen, "opencl_%d_impl_%u", devid,nimpl);
 	}
 	}
+	else if ((STARPU_MIC_DEFAULT <= arch)
+		&& (arch < STARPU_MIC_DEFAULT + STARPU_MAXMICDEVS))
+	{
+		int devid = arch - STARPU_MIC_DEFAULT;
+		snprintf(archname, maxlen, "mic_%d_impl_%u", devid, nimpl);
+	}
 	else
 	else
 	{
 	{
 		STARPU_ABORT();
 		STARPU_ABORT();

+ 2 - 1
src/core/sched_ctx.c

@@ -842,7 +842,7 @@ void* starpu_sched_ctx_get_policy_data(unsigned sched_ctx_id)
 	return sched_ctx->policy_data;
 	return sched_ctx->policy_data;
 }
 }
 
 
-struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, int worker_collection_type)
+struct starpu_worker_collection* starpu_sched_ctx_create_worker_collection(unsigned sched_ctx_id, enum starpu_worker_collection_type  worker_collection_type)
 {
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	sched_ctx->workers = (struct starpu_worker_collection*)malloc(sizeof(struct starpu_worker_collection));
 	sched_ctx->workers = (struct starpu_worker_collection*)malloc(sizeof(struct starpu_worker_collection));
@@ -881,6 +881,7 @@ static unsigned _get_workers_list(struct _starpu_sched_ctx *sched_ctx, int **wor
 	}
 	}
 	return nworkers;
 	return nworkers;
 }
 }
+
 void starpu_sched_ctx_delete_worker_collection(unsigned sched_ctx_id)
 void starpu_sched_ctx_delete_worker_collection(unsigned sched_ctx_id)
 {
 {
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);
 	struct _starpu_sched_ctx *sched_ctx = _starpu_get_sched_ctx_struct(sched_ctx_id);

+ 24 - 2
src/core/sched_policy.c

@@ -447,7 +447,7 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 {
 {
 	struct starpu_task *conversion_task;
 	struct starpu_task *conversion_task;
 
 
-#if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA) || defined(STARPU_USE_MIC) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID)
 	struct starpu_multiformat_interface *format_interface;
 	struct starpu_multiformat_interface *format_interface;
 #endif
 #endif
 
 
@@ -455,7 +455,7 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 	conversion_task->synchronous = 0;
 	conversion_task->synchronous = 0;
 	STARPU_TASK_SET_HANDLE(conversion_task, handle, 0);
 	STARPU_TASK_SET_HANDLE(conversion_task, handle, 0);
 
 
-#if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_OPENCL) || defined(STARPU_USE_CUDA) || defined(STARPU_USE_MIC) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID)
 	/* The node does not really matter here */
 	/* The node does not really matter here */
 	format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, 0);
 	format_interface = (struct starpu_multiformat_interface *) starpu_data_get_interface_on_node(handle, 0);
 #endif
 #endif
@@ -468,9 +468,13 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 	switch(node_kind)
 	switch(node_kind)
 	{
 	{
 	case STARPU_CPU_RAM:
 	case STARPU_CPU_RAM:
+	case STARPU_SCC_RAM:
+	case STARPU_SCC_SHM:
 		switch (starpu_node_get_kind(handle->mf_node))
 		switch (starpu_node_get_kind(handle->mf_node))
 		{
 		{
 		case STARPU_CPU_RAM:
 		case STARPU_CPU_RAM:
+		case STARPU_SCC_RAM:
+		case STARPU_SCC_SHM:
 			STARPU_ABORT();
 			STARPU_ABORT();
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 		case STARPU_CUDA_RAM:
 		case STARPU_CUDA_RAM:
@@ -490,6 +494,15 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 			break;
 			break;
 		}
 		}
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+		case STARPU_MIC_RAM:
+		{
+			struct starpu_multiformat_data_interface_ops *mf_ops;
+			mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
+			conversion_task->cl = mf_ops->mic_to_cpu_cl;
+			break;
+		}
+#endif
 		default:
 		default:
 			_STARPU_ERROR("Oops : %u\n", handle->mf_node);
 			_STARPU_ERROR("Oops : %u\n", handle->mf_node);
 		}
 		}
@@ -512,6 +525,15 @@ struct starpu_task *_starpu_create_conversion_task_for_arch(starpu_data_handle_t
 		break;
 		break;
 	}
 	}
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+	case STARPU_MIC_RAM:
+	{
+		struct starpu_multiformat_data_interface_ops *mf_ops;
+		mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
+		conversion_task->cl = mf_ops->cpu_to_mic_cl;
+		break;
+	}
+#endif
 	default:
 	default:
 		STARPU_ABORT();
 		STARPU_ABORT();
 	}
 	}

+ 36 - 0
src/core/task.c

@@ -341,6 +341,21 @@ void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl)
 	{
 	{
 		cl->where |= STARPU_OPENCL;
 		cl->where |= STARPU_OPENCL;
 	}
 	}
+
+	if (cl->mic_funcs[0] && is_where_unset)
+	{
+		cl->where |= STARPU_MIC;
+	}
+
+	if (cl->scc_funcs[0] && is_where_unset)
+	{
+		cl->where |= STARPU_SCC;
+	}
+
+	if (cl->cpu_funcs_name[0] && is_where_unset)
+	{
+		cl->where |= STARPU_MIC|STARPU_SCC;
+	}
 }
 }
 
 
 void _starpu_task_check_deprecated_fields(struct starpu_task *task)
 void _starpu_task_check_deprecated_fields(struct starpu_task *task)
@@ -876,6 +891,8 @@ _starpu_handle_needs_conversion_task_for_arch(starpu_data_handle_t handle,
 					return 0;
 					return 0;
 				case STARPU_CUDA_RAM:      /* Fall through */
 				case STARPU_CUDA_RAM:      /* Fall through */
 				case STARPU_OPENCL_RAM:
 				case STARPU_OPENCL_RAM:
+				case STARPU_MIC_RAM:
+				case STARPU_SCC_RAM:
 					return 1;
 					return 1;
 				default:
 				default:
 					STARPU_ABORT();
 					STARPU_ABORT();
@@ -883,12 +900,16 @@ _starpu_handle_needs_conversion_task_for_arch(starpu_data_handle_t handle,
 			break;
 			break;
 		case STARPU_CUDA_RAM:    /* Fall through */
 		case STARPU_CUDA_RAM:    /* Fall through */
 		case STARPU_OPENCL_RAM:
 		case STARPU_OPENCL_RAM:
+		case STARPU_MIC_RAM:
+		case STARPU_SCC_RAM:
 			switch(starpu_node_get_kind(handle->mf_node))
 			switch(starpu_node_get_kind(handle->mf_node))
 			{
 			{
 				case STARPU_CPU_RAM:
 				case STARPU_CPU_RAM:
 					return 1;
 					return 1;
 				case STARPU_CUDA_RAM:
 				case STARPU_CUDA_RAM:
 				case STARPU_OPENCL_RAM:
 				case STARPU_OPENCL_RAM:
+				case STARPU_MIC_RAM:
+				case STARPU_SCC_RAM:
 					return 0;
 					return 0;
 				default:
 				default:
 					STARPU_ABORT();
 					STARPU_ABORT();
@@ -925,3 +946,18 @@ unsigned starpu_task_get_implementation(struct starpu_task *task)
 {
 {
 	return _starpu_get_job_associated_to_task(task)->nimpl;
 	return _starpu_get_job_associated_to_task(task)->nimpl;
 }
 }
+
+/* Return the MIC implementation stored at index NIMPL of codelet CL.
+ * NIMPL is not range-checked here. */
+starpu_mic_func_t _starpu_task_get_mic_nth_implementation(struct starpu_codelet *cl, unsigned nimpl)
+{
+	starpu_mic_func_t impl = cl->mic_funcs[nimpl];
+
+	return impl;
+}
+
+/* Return the SCC implementation stored at index NIMPL of codelet CL.
+ * NIMPL is not range-checked here. */
+starpu_scc_func_t _starpu_task_get_scc_nth_implementation(struct starpu_codelet *cl, unsigned nimpl)
+{
+	starpu_scc_func_t impl = cl->scc_funcs[nimpl];
+
+	return impl;
+}
+
+/* Return the entry at index NIMPL of CL's cpu_funcs_name array.
+ * NIMPL is not range-checked here. */
+char *_starpu_task_get_cpu_name_nth_implementation(struct starpu_codelet *cl, unsigned nimpl)
+{
+	char *func_name = cl->cpu_funcs_name[nimpl];
+
+	return func_name;
+}

+ 4 - 0
src/core/task.h

@@ -72,6 +72,10 @@ void _starpu_codelet_check_deprecated_fields(struct starpu_codelet *cl);
 starpu_cpu_func_t _starpu_task_get_cpu_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
 starpu_cpu_func_t _starpu_task_get_cpu_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
 starpu_cuda_func_t _starpu_task_get_cuda_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
 starpu_cuda_func_t _starpu_task_get_cuda_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
 starpu_opencl_func_t _starpu_task_get_opencl_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
 starpu_opencl_func_t _starpu_task_get_opencl_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
+starpu_mic_func_t _starpu_task_get_mic_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
+starpu_scc_func_t _starpu_task_get_scc_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
+
+char *_starpu_task_get_cpu_name_nth_implementation(struct starpu_codelet *cl, unsigned nimpl);
 
 
 #define _STARPU_TASK_SET_INTERFACE(task, interface, i) do { if (task->dyn_handles) task->dyn_interfaces[i] = interface; else task->interfaces[i] = interface;} while(0)
 #define _STARPU_TASK_SET_INTERFACE(task, interface, i) do { if (task->dyn_handles) task->dyn_interfaces[i] = interface; else task->interfaces[i] = interface;} while(0)
 #define _STARPU_TASK_GET_INTERFACES(task) ((task->dyn_handles) ? task->dyn_interfaces : task->interfaces)
 #define _STARPU_TASK_GET_INTERFACES(task) ((task->dyn_handles) ? task->dyn_interfaces : task->interfaces)

+ 423 - 13
src/core/topology.c

@@ -23,6 +23,9 @@
 #include <core/debug.h>
 #include <core/debug.h>
 #include <core/topology.h>
 #include <core/topology.h>
 #include <drivers/cuda/driver_cuda.h>
 #include <drivers/cuda/driver_cuda.h>
+#include <drivers/mic/driver_mic_source.h>
+#include <drivers/scc/driver_scc_source.h>
+#include <drivers/mp_common/source_common.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <profiling/profiling.h>
 #include <profiling/profiling.h>
 #include <common/uthash.h>
 #include <common/uthash.h>
@@ -45,7 +48,7 @@
 
 
 static unsigned topology_is_initialized = 0;
 static unsigned topology_is_initialized = 0;
 
 
-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SCC) || defined(STARPU_SIMGRID)
 
 
 struct handle_entry
 struct handle_entry
 {
 {
@@ -67,9 +70,9 @@ static unsigned may_bind_automatically = 0;
  * Discover the topology of the machine
  * Discover the topology of the machine
  */
  */
 
 
-#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_SIMGRID)
+#if defined(STARPU_USE_CUDA) || defined(STARPU_USE_OPENCL) || defined(STARPU_USE_SCC)  || defined(STARPU_SIMGRID)
 static void
 static void
-_starpu_initialize_workers_gpuid (int *explicit_workers_gpuid,
+_starpu_initialize_workers_deviceid (int *explicit_workers_gpuid,
 				  int *current, int *workers_gpuid,
 				  int *current, int *workers_gpuid,
 				  const char *varname, unsigned nhwgpus)
 				  const char *varname, unsigned nhwgpus)
 {
 {
@@ -144,7 +147,8 @@ _starpu_initialize_workers_gpuid (int *explicit_workers_gpuid,
 			  workers_gpuid[i] = (unsigned)(i % nhwgpus);
 			  workers_gpuid[i] = (unsigned)(i % nhwgpus);
 
 
 		/* StarPU can use sampling techniques to bind threads
 		/* StarPU can use sampling techniques to bind threads
-		 * correctly */
+		 * correctly
+		 * TODO: use a private value for each kind of device */
 		may_bind_automatically = 1;
 		may_bind_automatically = 1;
 	}
 	}
 }
 }
@@ -157,7 +161,7 @@ _starpu_initialize_workers_cuda_gpuid (struct _starpu_machine_config *config)
 	struct starpu_machine_topology *topology = &config->topology;
 	struct starpu_machine_topology *topology = &config->topology;
 	struct starpu_conf *uconf = config->conf;
 	struct starpu_conf *uconf = config->conf;
 
 
-        _starpu_initialize_workers_gpuid (
+        _starpu_initialize_workers_deviceid (
 		uconf->use_explicit_workers_cuda_gpuid == 0
 		uconf->use_explicit_workers_cuda_gpuid == 0
 		? NULL
 		? NULL
 		: (int *)uconf->workers_cuda_gpuid,
 		: (int *)uconf->workers_cuda_gpuid,
@@ -184,7 +188,7 @@ _starpu_initialize_workers_opencl_gpuid (struct _starpu_machine_config*config)
 	struct starpu_machine_topology *topology = &config->topology;
 	struct starpu_machine_topology *topology = &config->topology;
 	struct starpu_conf *uconf = config->conf;
 	struct starpu_conf *uconf = config->conf;
 
 
-        _starpu_initialize_workers_gpuid(
+        _starpu_initialize_workers_deviceid(
 		uconf->use_explicit_workers_opencl_gpuid == 0
 		uconf->use_explicit_workers_opencl_gpuid == 0
 		? NULL
 		? NULL
 		: (int *)uconf->workers_opencl_gpuid,
 		: (int *)uconf->workers_opencl_gpuid,
@@ -258,6 +262,147 @@ _starpu_get_next_opencl_gpuid (struct _starpu_machine_config *config)
 }
 }
 #endif
 #endif
 
 
+/* Compiled out by the enclosing "#if 0": MIC counterpart of the
+ * CUDA/OpenCL/SCC device id initializers, forwarding to
+ * _starpu_initialize_workers_deviceid() with the STARPU_WORKERS_MICID
+ * variable name. */
+#if 0
+#if defined(STARPU_USE_MIC) || defined(STARPU_SIMGRID)
+static void _starpu_initialize_workers_mic_deviceid(struct _starpu_machine_config *config)
+{
+	struct starpu_machine_topology *topology = &config->topology;
+	struct starpu_conf *uconf = config->conf;
+
+	_starpu_initialize_workers_deviceid(
+		uconf->use_explicit_workers_mic_deviceid == 0
+		? NULL
+		: (int *)config->user_conf->workers_mic_deviceid,
+		&(config->current_mic_deviceid),
+		(int *)topology->workers_mic_deviceid,
+		"STARPU_WORKERS_MICID",
+		topology->nhwmiccores);
+}
+#endif
+#endif
+
+#ifdef STARPU_USE_SCC
+/* Fill the SCC worker device id table of CONFIG's topology.  The explicit
+ * id list from the user configuration is forwarded only when the user
+ * enabled it; otherwise NULL is passed to the generic initializer. */
+static void _starpu_initialize_workers_scc_deviceid(struct _starpu_machine_config *config)
+{
+	struct starpu_conf *uconf = config->conf;
+	int *explicit_ids = NULL;
+
+	if (uconf->use_explicit_workers_scc_deviceid != 0)
+		explicit_ids = (int *) uconf->workers_scc_deviceid;
+
+	_starpu_initialize_workers_deviceid(
+		explicit_ids,
+		&(config->current_scc_deviceid),
+		(int *) config->topology.workers_scc_deviceid,
+		"STARPU_WORKERS_SCCID",
+		config->topology.nhwscc);
+}
+#endif /* STARPU_USE_SCC */
+
+/* Compiled out by the enclosing "#if 0": would return the next MIC device
+ * id by indexing workers_mic_deviceid with current_mic_deviceid modulo
+ * nmicdevices, then advancing current_mic_deviceid. */
+#if 0
+#ifdef STARPU_USE_MIC
+static inline int _starpu_get_next_mic_deviceid(struct _starpu_machine_config *config)
+{
+	unsigned i = ((config->current_mic_deviceid++) % config->topology.nmicdevices);
+
+	return (int)config->topology.workers_mic_deviceid[i];
+}
+#endif
+#endif
+
+#ifdef STARPU_USE_SCC
+/* Return the SCC device id for the next worker: the slot is the current
+ * counter modulo the number of SCC devices, and the counter is advanced
+ * for the following call. */
+static inline int _starpu_get_next_scc_deviceid(struct _starpu_machine_config *config)
+{
+	struct starpu_machine_topology *topo = &config->topology;
+	unsigned slot = config->current_scc_deviceid % topo->nsccdevices;
+
+	config->current_scc_deviceid++;
+
+	return (int) topo->workers_scc_deviceid[slot];
+}
+#endif
+
+#ifdef STARPU_USE_MIC
+/* Discover the topology of the MIC node identified by MIC_IDX: ask the
+ * node for its core count and record it in the `nhwmiccores' topology
+ * field of CONFIG. */
+static void
+_starpu_init_mic_topology (struct _starpu_machine_config *config, long mic_idx)
+{
+	int sink_cores;
+
+	_starpu_src_common_sink_nbcores (mic_nodes[mic_idx], &sink_cores);
+
+	config->topology.nhwmiccores[mic_idx] = sink_cores;
+}
+
+
+/* Initialize the MIC node of index MIC_IDX.
+ *
+ * Locates the sink program, gets the COI engine handle for the device
+ * (stored in *COI_HANDLE), launches the sink program on it (process stored
+ * in *COI_PROCESS) and creates the mp_common node in mic_nodes[MIC_IDX].
+ *
+ * Returns 0 on success, -1 when no sink program could be located. */
+static int
+_starpu_init_mic_node (struct _starpu_machine_config *config, int mic_idx,
+		       COIENGINE *coi_handle, COIPROCESS *coi_process)
+{
+	struct starpu_conf *user_conf = config->conf;
+
+	char ***argv = _starpu_get_argv();
+	const char *suffixes[] = {"-mic", "_mic", NULL};
+
+	/* Environment variables to send to the Sink, it informs it what kind
+	 * of node it is (architecture and type) as there is no way to discover
+	 * it itself */
+	char mic_idx_env[32];
+	snprintf(mic_idx_env, sizeof(mic_idx_env), "DEVID=%d", mic_idx);
+
+	/* XXX: this is currently necessary so that the remote process does not
+	 * segfault. */
+	char nb_mic_env[32];
+	snprintf(nb_mic_env, sizeof(nb_mic_env), "NB_MIC=%d", 2);
+
+	const char *mic_sink_env[] = {"STARPU_SINK=STARPU_MIC", mic_idx_env, nb_mic_env, NULL};
+
+	char mic_sink_program_path[1024];
+	/* Let's get the helper program to run on the MIC device */
+	int mic_file_found =
+	    _starpu_src_common_locate_file (mic_sink_program_path,
+					    getenv("STARPU_MIC_SINK_PROGRAM_NAME"),
+					    getenv("STARPU_MIC_SINK_PROGRAM_PATH"),
+					    user_conf->mic_sink_program_path,
+					    (argv ? (*argv)[0] : NULL),
+					    suffixes);
+
+	if (0 != mic_file_found) {
+		/* Adjacent string literals are concatenated verbatim, so
+		 * each piece carries its own separating space. */
+		fprintf(stderr, "No MIC program specified, use the environment "
+			"variable STARPU_MIC_SINK_PROGRAM_NAME or the environment "
+			"variable STARPU_MIC_SINK_PROGRAM_PATH "
+			"or the field 'starpu_conf.mic_sink_program_path' "
+			"to define it.\n");
+
+		return -1;
+	}
+
+	COIRESULT res;
+	/* Let's get the handle which let us manage the remote MIC device */
+	res = COIEngineGetHandle(COI_ISA_MIC, mic_idx, coi_handle);
+	if (STARPU_UNLIKELY(res != COI_SUCCESS))
+		STARPU_MIC_SRC_REPORT_COI_ERROR(res);
+
+	/* We launch the helper on the MIC device, which will wait for us
+	 * to give it work to do.
+	 * As we will communicate further with the device through scif we
+	 * don't need to keep the process pointer */
+	res = COIProcessCreateFromFile(*coi_handle, mic_sink_program_path, 0, NULL, 0,
+				       mic_sink_env, 1, NULL, 0, NULL,
+				       coi_process);
+	if (STARPU_UNLIKELY(res != COI_SUCCESS))
+		STARPU_MIC_SRC_REPORT_COI_ERROR(res);
+
+	/* Let's create the node structure, we'll communicate with the peer
+	 * through scif thanks to it */
+	mic_nodes[mic_idx] =
+		_starpu_mp_common_node_create(STARPU_MIC_SOURCE, mic_idx);
+
+	return 0;
+}
+#endif
+
 
 
 static void
 static void
 _starpu_init_topology (struct _starpu_machine_config *config)
 _starpu_init_topology (struct _starpu_machine_config *config)
@@ -284,6 +429,9 @@ _starpu_init_topology (struct _starpu_machine_config *config)
 	_starpu_cpu_discover_devices(config);
 	_starpu_cpu_discover_devices(config);
 	_starpu_cuda_discover_devices(config);
 	_starpu_cuda_discover_devices(config);
 	_starpu_opencl_discover_devices(config);
 	_starpu_opencl_discover_devices(config);
+#ifdef STARPU_USE_SCC
+	config->topology.nhwscc = _starpu_scc_src_get_device_count();
+#endif
 
 
 	topology_is_initialized = 1;
 	topology_is_initialized = 1;
 }
 }
@@ -434,8 +582,137 @@ _starpu_topology_get_nhwcpu (struct _starpu_machine_config *config)
 	return config->topology.nhwcpus;
 	return config->topology.nhwcpus;
 }
 }
 
 
+#ifdef STARPU_USE_MIC
+/* Configure the MIC device of index MIC_IDX.
+ *
+ * Queries the number of cores of the device, decides how many of them to
+ * use (STARPU_NMIC environment variable first, then USER_CONF->nmic, else
+ * all detected cores, capped to the detected count), and appends one
+ * worker per selected core to CONFIG->workers, updating the topology's
+ * nmiccores[MIC_IDX] and nworkers. */
+static void
+_starpu_init_mic_config (struct _starpu_machine_config *config,
+			 struct starpu_conf *user_conf,
+			 unsigned mic_idx)
+{
+	struct starpu_machine_topology *topology = &config->topology;
+
+	topology->nhwmiccores[mic_idx] = 0;
+
+	_starpu_init_mic_topology (config, mic_idx);
+
+	int nmiccores;
+	nmiccores = starpu_get_env_number("STARPU_NMIC");
+
+	/* STARPU_NMIC is not set. Did the user specify anything ? */
+	if (nmiccores == -1 && user_conf)
+		nmiccores = user_conf->nmic;
+
+	if (nmiccores != 0)
+	{
+		if (nmiccores == -1)
+		{
+			/* Nothing was specified, so let's use the number of
+			 * detected mic cores. ! */
+			nmiccores = topology->nhwmiccores[mic_idx];
+		}
+		else
+		{
+			if ((unsigned) nmiccores > topology->nhwmiccores[mic_idx])
+			{
+				/* The user requires more MIC devices than there is available */
+				fprintf(stderr,
+					"# Warning: %d MIC devices requested. Only %d available.\n",
+					nmiccores, topology->nhwmiccores[mic_idx]);
+				nmiccores = topology->nhwmiccores[mic_idx];
+			}
+		}
+	}
+
+	topology->nmiccores[mic_idx] = nmiccores;
+	STARPU_ASSERT(topology->nmiccores[mic_idx] + topology->nworkers <= STARPU_NMAXWORKERS);
+
+	/* _starpu_initialize_workers_mic_deviceid (config); */
+
+	unsigned miccore_id;
+	for (miccore_id = 0; miccore_id < topology->nmiccores[mic_idx]; miccore_id++)
+	{
+		int worker_idx = topology->nworkers + miccore_id;
+		/* The performance-model architecture is per MIC device, so it
+		 * is offset by the device index (there is no `devid' variable
+		 * in this function). */
+		enum starpu_perfmodel_archtype arch =
+			(enum starpu_perfmodel_archtype)((int)STARPU_MIC_DEFAULT + (int)mic_idx);
+		config->workers[worker_idx].arch = STARPU_MIC_WORKER;
+		config->workers[worker_idx].perf_arch = arch;
+		config->workers[worker_idx].mp_nodeid = mic_idx;
+		config->workers[worker_idx].devid = miccore_id;
+		config->workers[worker_idx].worker_mask = STARPU_MIC;
+		config->worker_mask |= STARPU_MIC;
+		_starpu_init_sched_ctx_for_worker(config->workers[worker_idx].workerid);
+	}
+
+	topology->nworkers += topology->nmiccores[mic_idx];
+}
+
+
+#ifdef STARPU_USE_MIC
+/* COI engine handles and sink processes, one slot per MIC device. */
+static COIENGINE handles[2];
+static COIPROCESS process[2];
+#endif
+
+/* Discover and configure the mp topology. That means:
+ * - discover the number of mp nodes;
+ * - initialize each discovered node;
+ * - discover the local topology (number of PUs/devices) of each node;
+ * - configure the workers accordingly.
+ *
+ * The number of MIC devices used is the minimum of the detected count, the
+ * STARPU_NMICDEVS environment variable (when set) and the number of slots
+ * in the handles[]/process[] arrays. */
+static void
+_starpu_init_mp_config (struct _starpu_machine_config *config,
+			struct starpu_conf *user_conf)
+{
+	struct starpu_machine_topology *topology = &config->topology;
+
+	// We currently only support MIC at this level.
+#ifdef STARPU_USE_MIC
+
+	/* Discover and initialize the number of MIC nodes through the mp
+	 * infrastructure. */
+	unsigned nhwmicdevices = _starpu_mic_src_get_device_count();
+	unsigned maxmicdevices = sizeof(handles) / sizeof(handles[0]);
+
+	int reqmicdevices = starpu_get_env_number("STARPU_NMICDEVS");
+	if (-1 == reqmicdevices)
+		reqmicdevices = nhwmicdevices;
+
+	unsigned nmicdevices = STARPU_MIN (nhwmicdevices, (unsigned) reqmicdevices);
+	if (nmicdevices > maxmicdevices)
+	{
+		/* handles[] and process[] are indexed by device below: never
+		 * go past their last slot. */
+		fprintf(stderr,
+			"# Warning: %u MIC devices requested. Only %u supported.\n",
+			nmicdevices, maxmicdevices);
+		nmicdevices = maxmicdevices;
+	}
+
+	topology->nmicdevices = 0;
+	unsigned i;
+	for (i = 0; i < nmicdevices; i++)
+		if (0 == _starpu_init_mic_node (config, i, &handles[i], &process[i]))
+			topology->nmicdevices++;
+
+	i = 0;
+	for (; i < topology->nmicdevices; i++)
+		_starpu_init_mic_config (config, user_conf, i);
+#endif
+}
+
+/* Shut down the MIC node of index MIC_IDX: send it the STARPU_EXIT
+ * command, destroy the COI process recorded in process[MIC_IDX], then
+ * destroy the mp_common node structure.  The result of
+ * COIProcessDestroy() is not checked. */
+static void
+_starpu_deinit_mic_node (unsigned mic_idx)
+{
+	_starpu_mp_common_send_command(mic_nodes[mic_idx], STARPU_EXIT, NULL, 0);
+
+	COIProcessDestroy(process[mic_idx], -1, 0, NULL, NULL);
+
+	_starpu_mp_common_node_destroy(mic_nodes[mic_idx]);
+}
+
+/* Tear down every MIC node counted in CONFIG's topology, in index order,
+ * then clear the registered MIC kernels. */
+static void
+_starpu_deinit_mp_config (struct _starpu_machine_config *config)
+{
+	unsigned mic_idx = 0;
+
+	while (mic_idx < config->topology.nmicdevices)
+	{
+		_starpu_deinit_mic_node (mic_idx);
+		mic_idx++;
+	}
+
+	_starpu_mic_clear_kernels();
+}
+#endif
+
 static int
 static int
-_starpu_init_machine_config (struct _starpu_machine_config *config)
+_starpu_init_machine_config (struct _starpu_machine_config *config, int no_mp_config)
 {
 {
 	int i;
 	int i;
 	for (i = 0; i < STARPU_NMAXWORKERS; i++)
 	for (i = 0; i < STARPU_NMAXWORKERS; i++)
@@ -498,6 +775,7 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 		int devid = _starpu_get_next_cuda_gpuid(config);
 		int devid = _starpu_get_next_cuda_gpuid(config);
 		enum starpu_perfmodel_archtype arch =
 		enum starpu_perfmodel_archtype arch =
 			(enum starpu_perfmodel_archtype)((int)STARPU_CUDA_DEFAULT + devid);
 			(enum starpu_perfmodel_archtype)((int)STARPU_CUDA_DEFAULT + devid);
+		config->workers[worker_idx].mp_nodeid = -1;
 		config->workers[worker_idx].devid = devid;
 		config->workers[worker_idx].devid = devid;
 		config->workers[worker_idx].perf_arch = arch;
 		config->workers[worker_idx].perf_arch = arch;
 		config->workers[worker_idx].worker_mask = STARPU_CUDA;
 		config->workers[worker_idx].worker_mask = STARPU_CUDA;
@@ -572,6 +850,7 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 		config->workers[worker_idx].arch = STARPU_OPENCL_WORKER;
 		config->workers[worker_idx].arch = STARPU_OPENCL_WORKER;
 		enum starpu_perfmodel_archtype arch =
 		enum starpu_perfmodel_archtype arch =
 			(enum starpu_perfmodel_archtype)((int)STARPU_OPENCL_DEFAULT + devid);
 			(enum starpu_perfmodel_archtype)((int)STARPU_OPENCL_DEFAULT + devid);
+		config->workers[worker_idx].mp_nodeid = -1;
 		config->workers[worker_idx].devid = devid;
 		config->workers[worker_idx].devid = devid;
 		config->workers[worker_idx].perf_arch = arch;
 		config->workers[worker_idx].perf_arch = arch;
 		config->workers[worker_idx].worker_mask = STARPU_OPENCL;
 		config->workers[worker_idx].worker_mask = STARPU_OPENCL;
@@ -582,6 +861,78 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 	topology->nworkers += topology->nopenclgpus;
 	topology->nworkers += topology->nopenclgpus;
 #endif
 #endif
 
 
+#ifdef STARPU_USE_SCC
+	int nscc = config->conf->nscc;
+
+	unsigned nb_scc_nodes = _starpu_scc_src_get_device_count();
+
+	if (nscc != 0)
+	{
+		/* The user did not disable SCC. We need to count
+		 * the number of devices */
+		int nb_devices = nb_scc_nodes;
+
+		if (nscc == -1)
+		{
+			/* Nothing was specified, so let's choose ! */
+			nscc = nb_devices;
+			if (nscc > STARPU_MAXSCCDEVS)
+			{
+				_STARPU_DISP("Warning: %d SCC devices available. Only %d enabled. Use configuration option --enable-maxsccdev=xxx to update the maximum value of supported SCC devices.\n", nb_devices, STARPU_MAXSCCDEVS);
+				nscc = STARPU_MAXSCCDEVS;
+			}
+		}
+		else
+		{
+			/* Let's make sure this value is OK. */
+			if (nscc > nb_devices)
+			{
+				/* The user requires more SCC devices than there is available */
+				_STARPU_DISP("Warning: %d SCC devices requested. Only %d available.\n", nscc, nb_devices);
+				nscc = nb_devices;
+			}
+			/* Let's make sure this value is OK. */
+			if (nscc > STARPU_MAXSCCDEVS)
+			{
+				_STARPU_DISP("Warning: %d SCC devices requested. Only %d enabled. Use configure option --enable-maxsccdev=xxx to update the maximum value of supported SCC devices.\n", nscc, STARPU_MAXSCCDEVS);
+				nscc = STARPU_MAXSCCDEVS;
+			}
+		}
+	}
+
+	/* Now we know how many SCC devices will be used */
+	topology->nsccdevices = nscc;
+	STARPU_ASSERT(topology->nsccdevices + topology->nworkers <= STARPU_NMAXWORKERS);
+
+	_starpu_initialize_workers_scc_deviceid(config);
+
+	unsigned sccdev;
+	for (sccdev = 0; sccdev < topology->nsccdevices; sccdev++)
+	{
+		config->workers[topology->nworkers + sccdev].arch = STARPU_SCC_WORKER;
+		int devid = _starpu_get_next_scc_deviceid(config);
+		enum starpu_perfmodel_archtype arch = (enum starpu_perfmodel_archtype)((int)STARPU_SCC_DEFAULT + devid);
+		config->workers[topology->nworkers + sccdev].mp_nodeid = -1;
+		config->workers[topology->nworkers + sccdev].devid = devid;
+		config->workers[topology->nworkers + sccdev].perf_arch = arch;
+		config->workers[topology->nworkers + sccdev].worker_mask = STARPU_SCC;
+		config->worker_mask |= STARPU_SCC;
+	}
+
+	for (; sccdev < nb_scc_nodes; ++sccdev)
+		_starpu_scc_exit_useless_node(sccdev);
+
+	topology->nworkers += topology->nsccdevices;
+#endif /* STARPU_USE_SCC */
+
+
+	/* Unless the caller asked us not to (no_mp_config), we need to
+	 * complete the configuration with the one of the mp nodes. */
+#ifdef STARPU_USE_MIC
+	if (! no_mp_config)
+	    _starpu_init_mp_config (config, config->conf);
+#endif
+
 /* we put the CPU section after the accelerator : in case there was an
 /* we put the CPU section after the accelerator : in case there was an
  * accelerator found, we devote one cpu */
  * accelerator found, we devote one cpu */
 #if defined(STARPU_USE_CPU) || defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_CPU) || defined(STARPU_SIMGRID)
@@ -591,8 +942,15 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 	{
 	{
 		if (ncpu == -1)
 		if (ncpu == -1)
 		{
 		{
-			unsigned already_busy_cpus = topology->ncudagpus + topology->nopenclgpus;
-			long avail_cpus = topology->nhwcpus - already_busy_cpus;
+			unsigned mic_busy_cpus = 0;
+			unsigned i = 0;
+			for (i = 0; i < STARPU_MAXMICDEVS; i++)
+				mic_busy_cpus += (topology->nmiccores[i] ? 1 : 0);
+
+			unsigned already_busy_cpus = mic_busy_cpus + topology->ncudagpus
+				+ topology->nopenclgpus + topology->nsccdevices;
+
+			long avail_cpus = (long) topology->nhwcpus - (long) already_busy_cpus;
 			if (avail_cpus < 0)
 			if (avail_cpus < 0)
 				avail_cpus = 0;
 				avail_cpus = 0;
 			ncpu = STARPU_MIN(avail_cpus, STARPU_MAXCPUS);
 			ncpu = STARPU_MIN(avail_cpus, STARPU_MAXCPUS);
@@ -617,6 +975,7 @@ _starpu_init_machine_config (struct _starpu_machine_config *config)
 		int worker_idx = topology->nworkers + cpu;
 		int worker_idx = topology->nworkers + cpu;
 		config->workers[worker_idx].arch = STARPU_CPU_WORKER;
 		config->workers[worker_idx].arch = STARPU_CPU_WORKER;
 		config->workers[worker_idx].perf_arch = STARPU_CPU_DEFAULT;
 		config->workers[worker_idx].perf_arch = STARPU_CPU_DEFAULT;
+		config->workers[worker_idx].mp_nodeid = -1;
 		config->workers[worker_idx].devid = cpu;
 		config->workers[worker_idx].devid = cpu;
 		config->workers[worker_idx].worker_mask = STARPU_CPU;
 		config->workers[worker_idx].worker_mask = STARPU_CPU;
 		config->worker_mask |= STARPU_CPU;
 		config->worker_mask |= STARPU_CPU;
@@ -745,7 +1104,7 @@ _starpu_bind_thread_on_cpus (
 
 
 
 
 static void
 static void
-_starpu_init_workers_binding (struct _starpu_machine_config *config)
+_starpu_init_workers_binding (struct _starpu_machine_config *config, int no_mp_config)
 {
 {
 	/* launch one thread per CPU */
 	/* launch one thread per CPU */
 	unsigned ram_memory_node;
 	unsigned ram_memory_node;
@@ -770,6 +1129,21 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config)
 	 * combinations in a matrix which we initialize here. */
 	 * combinations in a matrix which we initialize here. */
 	_starpu_initialize_busid_matrix();
 	_starpu_initialize_busid_matrix();
 
 
+#ifdef STARPU_USE_MIC
+	/* Each MIC device has its own memory node. */
+	unsigned mic_memory_nodes[STARPU_MAXMICDEVS];
+
+	// Register the memory nodes for the MIC devices.
+	if (! no_mp_config) {
+	    unsigned i = 0;
+	    for (i = 0; i < config->topology.nmicdevices; i++) {
+		mic_memory_nodes[i] = _starpu_memory_node_register (STARPU_MIC_RAM, i);
+		_starpu_register_bus(0, mic_memory_nodes[i]);
+		_starpu_register_bus(mic_memory_nodes[i], 0);
+	    }
+	}
+#endif
+
 	unsigned worker;
 	unsigned worker;
 	for (worker = 0; worker < config->topology.nworkers; worker++)
 	for (worker = 0; worker < config->topology.nworkers; worker++)
 	{
 	{
@@ -852,6 +1226,38 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config)
 				break;
 				break;
 #endif
 #endif
 
 
+#ifdef STARPU_USE_MIC
+		        case STARPU_MIC_WORKER:
+				//if (may_bind_automatically)
+				//{
+				//	/* StarPU is allowed to bind threads automatically */
+				//	preferred_binding = _starpu_get_mic_affinity_vector(workerarg->devid);
+				//	npreferred = config->topology.nhwcpus;
+				//}
+				is_a_set_of_accelerators = 1;
+				memory_node = mic_memory_nodes[workerarg->mp_nodeid];
+				_starpu_memory_node_add_nworkers(memory_node);
+				/* memory_node = _starpu_memory_node_register(STARPU_MIC_RAM, workerarg->devid);*/
+
+				/* _starpu_register_bus(0, memory_node);
+				 * _starpu_register_bus(memory_node, 0); */
+				break;
+#endif /* STARPU_USE_MIC */
+
+#ifdef STARPU_USE_SCC
+			case STARPU_SCC_WORKER:
+			{
+				/* Node 0 represents the SCC shared memory when we're on SCC. */
+				struct _starpu_memory_node_descr *descr = _starpu_memory_node_get_description();
+				descr->nodes[ram_memory_node] = STARPU_SCC_SHM;
+
+				is_a_set_of_accelerators = 0;
+				memory_node = ram_memory_node;
+				_starpu_memory_node_add_nworkers(memory_node);
+			}
+				break;
+#endif
+
 			default:
 			default:
 				STARPU_ABORT();
 				STARPU_ABORT();
 		}
 		}
@@ -902,18 +1308,18 @@ _starpu_init_workers_binding (struct _starpu_machine_config *config)
 
 
 
 
 int
 int
-_starpu_build_topology (struct _starpu_machine_config *config)
+_starpu_build_topology (struct _starpu_machine_config *config, int no_mp_config)
 {
 {
 	int ret;
 	int ret;
 
 
-	ret = _starpu_init_machine_config(config);
+	ret = _starpu_init_machine_config(config, no_mp_config);
 	if (ret)
 	if (ret)
 		return ret;
 		return ret;
 
 
 	/* for the data management library */
 	/* for the data management library */
 	_starpu_memory_nodes_init();
 	_starpu_memory_nodes_init();
 
 
-	_starpu_init_workers_binding(config);
+	_starpu_init_workers_binding(config, no_mp_config);
 
 
 	return 0;
 	return 0;
 }
 }
@@ -922,6 +1328,10 @@ void
 _starpu_destroy_topology (
 _starpu_destroy_topology (
 	struct _starpu_machine_config *config __attribute__ ((unused)))
 	struct _starpu_machine_config *config __attribute__ ((unused)))
 {
 {
+#ifdef STARPU_USE_MIC
+	_starpu_deinit_mp_config(config);
+#endif
+
 	/* cleanup StarPU internal data structures */
 	/* cleanup StarPU internal data structures */
 	_starpu_memory_nodes_deinit();
 	_starpu_memory_nodes_deinit();
 
 

+ 1 - 1
src/core/topology.h

@@ -27,7 +27,7 @@
 struct _starpu_machine_config;
 struct _starpu_machine_config;
 
 
 /* Detect the number of memory nodes and where to bind the different workers. */
 /* Detect the number of memory nodes and where to bind the different workers. */
-int _starpu_build_topology(struct _starpu_machine_config *config);
+int _starpu_build_topology(struct _starpu_machine_config *config, int no_mp_config);
 
 
 /* Destroy all resources used to store the topology of the machine. */
 /* Destroy all resources used to store the topology of the machine. */
 void _starpu_destroy_topology(struct _starpu_machine_config *config);
 void _starpu_destroy_topology(struct _starpu_machine_config *config);

+ 261 - 10
src/core/workers.c

@@ -28,6 +28,8 @@
 #include <core/task.h>
 #include <core/task.h>
 #include <profiling/profiling.h>
 #include <profiling/profiling.h>
 #include <starpu_task_list.h>
 #include <starpu_task_list.h>
+#include <drivers/mp_common/sink_common.h>
+#include <drivers/scc/driver_scc_common.h>
 
 
 #include <drivers/cpu/driver_cpu.h>
 #include <drivers/cpu/driver_cpu.h>
 #include <drivers/cuda/driver_cuda.h>
 #include <drivers/cuda/driver_cuda.h>
@@ -51,6 +53,29 @@ static starpu_pthread_key_t worker_key;
 
 
 static struct _starpu_machine_config config;
 static struct _starpu_machine_config config;
 
 
+/* Pointers to the argc and argv of the process. Both are null until
+ * _starpu_set_argc_argv() stores something else in them.
+ */
+static int *my_argc = NULL;
+static char ***my_argv = NULL;
+
+/* Remember where the process keeps its argc and argv, so that they can
+ * be handed back later by _starpu_get_argc() and _starpu_get_argv().
+ * Only the two pointers are stored; nothing is copied.
+ */
+void _starpu_set_argc_argv(int *argc_param, char ***argv_param)
+{
+	my_argv = argv_param;
+	my_argc = argc_param;
+}
+
+/* Return the argc pointer last stored by _starpu_set_argc_argv(), or a
+ * null pointer if nothing was stored yet. */
+int *_starpu_get_argc(void)
+{
+	return my_argc;
+}
+
+/* Return the argv pointer last stored by _starpu_set_argc_argv(), or a
+ * null pointer if nothing was stored yet. */
+char ***_starpu_get_argv(void)
+{
+	return my_argv;
+}
+
 int _starpu_is_initialized(void)
 int _starpu_is_initialized(void)
 {
 {
 	return initialized == INITIALIZED;
 	return initialized == INITIALIZED;
@@ -98,6 +123,14 @@ static uint32_t _starpu_worker_exists_and_can_execute(struct starpu_task *task,
 				if (task->cl->opencl_funcs[impl] != NULL)
 				if (task->cl->opencl_funcs[impl] != NULL)
 					test_implementation = 1;
 					test_implementation = 1;
 				break;
 				break;
+			case STARPU_MIC_WORKER:
+				if (task->cl->cpu_funcs_name[impl] != NULL || task->cl->mic_funcs[impl] != NULL)
+					test_implementation = 1;
+				break;
+			case STARPU_SCC_WORKER:
+				if (task->cl->cpu_funcs_name[impl] != NULL || task->cl->scc_funcs[impl] != NULL)
+					test_implementation = 1;
+				break;
 			default:
 			default:
 				STARPU_ABORT();
 				STARPU_ABORT();
 			}
 			}
@@ -140,6 +173,16 @@ uint32_t _starpu_worker_exists(struct starpu_task *task)
 	    _starpu_worker_exists_and_can_execute(task, STARPU_OPENCL_WORKER))
 	    _starpu_worker_exists_and_can_execute(task, STARPU_OPENCL_WORKER))
 		return 1;
 		return 1;
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+	if ((task->cl->where & STARPU_MIC) &&
+	    _starpu_worker_exists_and_can_execute(task, STARPU_MIC_WORKER))
+		return 1;
+#endif
+#ifdef STARPU_USE_SCC
+	if ((task->cl->where & STARPU_SCC) &&
+	    _starpu_worker_exists_and_can_execute(task, STARPU_SCC_WORKER))
+		return 1;
+#endif
 	return 0;
 	return 0;
 }
 }
 
 
@@ -158,6 +201,11 @@ uint32_t _starpu_can_submit_opencl_task(void)
 	return (STARPU_OPENCL & config.worker_mask);
 	return (STARPU_OPENCL & config.worker_mask);
 }
 }
 
 
+/* Non-zero iff the STARPU_SCC bit is set in the worker mask of the
+ * current configuration. */
+uint32_t _starpu_can_submit_scc_task(void)
+{
+	return config.worker_mask & STARPU_SCC;
+}
+
 static int _starpu_can_use_nth_implementation(enum starpu_worker_archtype arch, struct starpu_codelet *cl, unsigned nimpl)
 static int _starpu_can_use_nth_implementation(enum starpu_worker_archtype arch, struct starpu_codelet *cl, unsigned nimpl)
 {
 {
 	switch(arch)
 	switch(arch)
@@ -196,13 +244,26 @@ static int _starpu_can_use_nth_implementation(enum starpu_worker_archtype arch,
 		starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, nimpl);
 		starpu_opencl_func_t func = _starpu_task_get_opencl_nth_implementation(cl, nimpl);
 		return func != NULL;
 		return func != NULL;
 	}
 	}
+	case STARPU_MIC_WORKER:
+	{
+		starpu_mic_func_t func = _starpu_task_get_mic_nth_implementation(cl, nimpl);
+		char *func_name = _starpu_task_get_cpu_name_nth_implementation(cl, nimpl);
+
+		return func != NULL || func_name != NULL;
+	}
+	case STARPU_SCC_WORKER:
+	{
+		starpu_scc_func_t func = _starpu_task_get_scc_nth_implementation(cl, nimpl);
+		char *func_name = _starpu_task_get_cpu_name_nth_implementation(cl, nimpl);
+
+		return func != NULL || func_name != NULL;
+	}
 	default:
 	default:
 		STARPU_ASSERT_MSG(0, "Unknown arch type %d", arch);
 		STARPU_ASSERT_MSG(0, "Unknown arch type %d", arch);
 	}
 	}
 	return 0;
 	return 0;
 }
 }
 
 
-
 int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 int starpu_worker_can_execute_task(unsigned workerid, struct starpu_task *task, unsigned nimpl)
 {
 {
 	/* TODO: check that the task operand sizes will fit on that device */
 	/* TODO: check that the task operand sizes will fit on that device */
@@ -255,6 +316,10 @@ int starpu_combined_worker_can_execute_task(unsigned workerid, struct starpu_tas
  * Runtime initialization methods
  * Runtime initialization methods
  */
  */
 
 
+#ifdef STARPU_USE_MIC
+static struct _starpu_worker_set mic_worker_set[STARPU_MAXMICDEVS];
+#endif
+
 static void _starpu_init_worker_queue(struct _starpu_worker *workerarg)
 static void _starpu_init_worker_queue(struct _starpu_worker *workerarg)
 {
 {
 	starpu_pthread_cond_t *cond = &workerarg->sched_cond;
 	starpu_pthread_cond_t *cond = &workerarg->sched_cond;
@@ -374,6 +439,9 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 	for (worker = 0; worker < nworkers; worker++)
 	for (worker = 0; worker < nworkers; worker++)
 	{
 	{
 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
 		struct _starpu_worker *workerarg = &pconfig->workers[worker];
+#ifdef STARPU_USE_MIC
+		unsigned mp_nodeid = workerarg->mp_nodeid;
+#endif
 
 
 		workerarg->config = pconfig;
 		workerarg->config = pconfig;
 
 
@@ -393,6 +461,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 		workerarg->run_by_starpu = 1;
 		workerarg->run_by_starpu = 1;
 		workerarg->worker_is_running = 0;
 		workerarg->worker_is_running = 0;
 		workerarg->worker_is_initialized = 0;
 		workerarg->worker_is_initialized = 0;
+		workerarg->set = NULL;
 
 
 		int ctx;
 		int ctx;
 		for(ctx = 0; ctx < STARPU_NMAX_SCHED_CTXS; ctx++)
 		for(ctx = 0; ctx < STARPU_NMAX_SCHED_CTXS; ctx++)
@@ -415,7 +484,7 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 
 
 		workerarg->status = STATUS_INITIALIZING;
 		workerarg->status = STATUS_INITIALIZING;
 
 
-		_STARPU_DEBUG("initialising worker %u\n", worker);
+		_STARPU_DEBUG("initialising worker %u/%u\n", worker, nworkers);
 
 
 		_starpu_init_worker_queue(workerarg);
 		_starpu_init_worker_queue(workerarg);
 
 
@@ -425,7 +494,6 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 		{
 		{
 #if defined(STARPU_USE_CPU) || defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_CPU) || defined(STARPU_SIMGRID)
 			case STARPU_CPU_WORKER:
 			case STARPU_CPU_WORKER:
-				workerarg->set = NULL;
 				driver.id.cpu_id = cpu;
 				driver.id.cpu_id = cpu;
 				if (_starpu_may_launch_driver(pconfig->conf, &driver))
 				if (_starpu_may_launch_driver(pconfig->conf, &driver))
 				{
 				{
@@ -437,6 +505,11 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 						workerarg,
 						workerarg,
 						worker+1);
 						worker+1);
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
+					/* In tracing mode, make sure the
+					 * thread is really started before
+					 * starting another one, to make sure
+					 * they appear in order in the trace.
+					 */
 					STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
 					STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
 					while (!workerarg->worker_is_running)
 					while (!workerarg->worker_is_running)
 						STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
 						STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
@@ -452,7 +525,6 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #endif
 #endif
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 #if defined(STARPU_USE_CUDA) || defined(STARPU_SIMGRID)
 			case STARPU_CUDA_WORKER:
 			case STARPU_CUDA_WORKER:
-				workerarg->set = NULL;
 				driver.id.cuda_id = cuda;
 				driver.id.cuda_id = cuda;
 				if (_starpu_may_launch_driver(pconfig->conf, &driver))
 				if (_starpu_may_launch_driver(pconfig->conf, &driver))
 				{
 				{
@@ -487,7 +559,6 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 					break;
 					break;
 				}
 				}
 #endif
 #endif
-				workerarg->set = NULL;
 				STARPU_PTHREAD_CREATE_ON(
 				STARPU_PTHREAD_CREATE_ON(
 					workerarg->name,
 					workerarg->name,
 					&workerarg->worker_thread,
 					&workerarg->worker_thread,
@@ -503,6 +574,77 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 #endif
 #endif
 				break;
 				break;
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+			case STARPU_MIC_WORKER:
+				/* We use the Gordon approach for the MIC,
+				 * which consists in spawning only one thread
+				 * per MIC device, which will control all MIC
+				 * workers of this device. (by using a worker set). */
+				if (mic_worker_set[mp_nodeid].started)
+					goto worker_set_initialized;
+
+				mic_worker_set[mp_nodeid].nworkers = pconfig->topology.nmiccores[mp_nodeid];
+
+				/* We assume all MIC workers of a given MIC
+				 * device are contiguous so that we can
+				 * address them with the first one only. */
+				mic_worker_set[mp_nodeid].workers = workerarg;
+				mic_worker_set[mp_nodeid].set_is_initialized = 0;
+
+				STARPU_PTHREAD_CREATE_ON(
+						workerarg->name,
+						&mic_worker_set[mp_nodeid].worker_thread,
+						NULL,
+						_starpu_mic_src_worker,
+						&mic_worker_set[mp_nodeid],
+						worker+1);
+
+#ifdef STARPU_USE_FXT
+				STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
+				while (!workerarg->worker_is_running)
+					STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
+#endif
+
+				STARPU_PTHREAD_MUTEX_LOCK(&mic_worker_set[mp_nodeid].mutex);
+				while (!mic_worker_set[mp_nodeid].set_is_initialized)
+					STARPU_PTHREAD_COND_WAIT(&mic_worker_set[mp_nodeid].ready_cond,
+								  &mic_worker_set[mp_nodeid].mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&mic_worker_set[mp_nodeid].mutex);
+
+		worker_set_initialized:
+				workerarg->set = &mic_worker_set[mp_nodeid];
+				mic_worker_set[mp_nodeid].started = 1;
+
+#ifdef STARPU_USE_FXT
+				STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
+				while (!workerarg->worker_is_running)
+					STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
+#endif
+
+				break;
+#endif /* STARPU_USE_MIC */
+#ifdef STARPU_USE_SCC
+			case STARPU_SCC_WORKER:
+				workerarg->worker_is_initialized = 0;
+				STARPU_PTHREAD_CREATE_ON(
+						workerarg->name,
+						&workerarg->worker_thread,
+						NULL,
+						_starpu_scc_src_worker,
+						workerarg,
+						worker+1);
+
+#ifdef STARPU_USE_FXT
+				STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
+				while (!workerarg->worker_is_running)
+					STARPU_PTHREAD_COND_WAIT(&workerarg->started_cond, &workerarg->mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
+#endif
+				break;
+#endif
+
 			default:
 			default:
 				STARPU_ABORT();
 				STARPU_ABORT();
 		}
 		}
@@ -560,6 +702,17 @@ static void _starpu_launch_drivers(struct _starpu_machine_config *pconfig)
 				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
 				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
 				break;
 				break;
 #endif
 #endif
+			case STARPU_MIC_WORKER:
+				/* Already waited above */
+				break;
+			case STARPU_SCC_WORKER:
+				/* TODO: implement may_launch? */
+				_STARPU_DEBUG("waiting for worker %u initialization\n", worker);
+				STARPU_PTHREAD_MUTEX_LOCK(&workerarg->mutex);
+				while (!workerarg->worker_is_initialized)
+					STARPU_PTHREAD_COND_WAIT(&workerarg->ready_cond, &workerarg->mutex);
+				STARPU_PTHREAD_MUTEX_UNLOCK(&workerarg->mutex);
+				break;
 			default:
 			default:
 				STARPU_ABORT();
 				STARPU_ABORT();
 		}
 		}
@@ -598,8 +751,11 @@ int starpu_conf_init(struct starpu_conf *conf)
 		conf->ncpus = starpu_get_env_number("STARPU_NCPUS");
 		conf->ncpus = starpu_get_env_number("STARPU_NCPUS");
 	conf->ncuda = starpu_get_env_number("STARPU_NCUDA");
 	conf->ncuda = starpu_get_env_number("STARPU_NCUDA");
 	conf->nopencl = starpu_get_env_number("STARPU_NOPENCL");
 	conf->nopencl = starpu_get_env_number("STARPU_NOPENCL");
+	conf->nmic = starpu_get_env_number("STARPU_NMIC");
+	conf->nscc = starpu_get_env_number("STARPU_NSCC");
 	conf->calibrate = starpu_get_env_number("STARPU_CALIBRATE");
 	conf->calibrate = starpu_get_env_number("STARPU_CALIBRATE");
 	conf->bus_calibrate = starpu_get_env_number("STARPU_BUS_CALIBRATE");
 	conf->bus_calibrate = starpu_get_env_number("STARPU_BUS_CALIBRATE");
+	conf->mic_sink_program_path = getenv("STARPU_MIC_PROGRAM_PATH");
 
 
 	if (conf->calibrate == -1)
 	if (conf->calibrate == -1)
 	     conf->calibrate = 0;
 	     conf->calibrate = 0;
@@ -610,6 +766,8 @@ int starpu_conf_init(struct starpu_conf *conf)
 	conf->use_explicit_workers_bindid = 0; /* TODO */
 	conf->use_explicit_workers_bindid = 0; /* TODO */
 	conf->use_explicit_workers_cuda_gpuid = 0; /* TODO */
 	conf->use_explicit_workers_cuda_gpuid = 0; /* TODO */
 	conf->use_explicit_workers_opencl_gpuid = 0; /* TODO */
 	conf->use_explicit_workers_opencl_gpuid = 0; /* TODO */
+	conf->use_explicit_workers_mic_deviceid = 0; /* TODO */
+	conf->use_explicit_workers_scc_deviceid = 0; /* TODO */
 
 
 	conf->single_combined_worker = starpu_get_env_number("STARPU_SINGLE_COMBINED_WORKER");
 	conf->single_combined_worker = starpu_get_env_number("STARPU_SINGLE_COMBINED_WORKER");
 	if (conf->single_combined_worker == -1)
 	if (conf->single_combined_worker == -1)
@@ -639,6 +797,14 @@ int starpu_conf_init(struct starpu_conf *conf)
 		conf->disable_asynchronous_opencl_copy = 0;
 		conf->disable_asynchronous_opencl_copy = 0;
 #endif
 #endif
 
 
+#if defined(STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY)
+	conf->disable_asynchronous_mic_copy = 1;
+#else
+	conf->disable_asynchronous_mic_copy = starpu_get_env_number("STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY");
+	if (conf->disable_asynchronous_mic_copy == -1)
+		conf->disable_asynchronous_mic_copy = 0;
+#endif
+
 	/* 64MiB by default */
 	/* 64MiB by default */
 	conf->trace_buffer_size = 64<<20;
 	conf->trace_buffer_size = 64<<20;
 	return 0;
 	return 0;
@@ -672,10 +838,37 @@ void _starpu_conf_check_environment(struct starpu_conf *conf)
 	_starpu_conf_set_value_against_environment("STARPU_DISABLE_ASYNCHRONOUS_COPY", &conf->disable_asynchronous_copy);
 	_starpu_conf_set_value_against_environment("STARPU_DISABLE_ASYNCHRONOUS_COPY", &conf->disable_asynchronous_copy);
 	_starpu_conf_set_value_against_environment("STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY", &conf->disable_asynchronous_cuda_copy);
 	_starpu_conf_set_value_against_environment("STARPU_DISABLE_ASYNCHRONOUS_CUDA_COPY", &conf->disable_asynchronous_cuda_copy);
 	_starpu_conf_set_value_against_environment("STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY", &conf->disable_asynchronous_opencl_copy);
 	_starpu_conf_set_value_against_environment("STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY", &conf->disable_asynchronous_opencl_copy);
+	_starpu_conf_set_value_against_environment("STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY", &conf->disable_asynchronous_mic_copy);
 }
 }
 
 
 int starpu_init(struct starpu_conf *user_conf)
 int starpu_init(struct starpu_conf *user_conf)
 {
 {
+	return starpu_initialize(user_conf, NULL, NULL);
+}
+
+int starpu_initialize(struct starpu_conf *user_conf, int *argc, char ***argv)
+{
+	int is_a_sink = 0; /* Always defined. If the MP infrastructure is not
+			    * used, we cannot be a sink. */
+#ifdef STARPU_USE_MP
+	_starpu_set_argc_argv(argc, argv);
+
+#	ifdef STARPU_USE_SCC
+	/* In SCC case we look at the rank to know if we are a sink */
+	if (_starpu_scc_common_mp_init() && !_starpu_scc_common_is_src_node())
+		setenv("STARPU_SINK", "STARPU_SCC", 1);
+#	endif
+
+	/* If StarPU was configured to use MP sinks, we have to check which
+	 * kind of node we are running on: host or sink? */
+	if (getenv("STARPU_SINK"))
+		is_a_sink = 1;
+#else
+	(void)argc;
+	(void)argv;
+
+#endif /* STARPU_USE_MP */
+
 	int ret;
 	int ret;
 
 
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
@@ -783,11 +976,17 @@ int starpu_init(struct starpu_conf *user_conf)
 
 
 	_starpu_load_bus_performance_files();
 	_starpu_load_bus_performance_files();
 
 
-	ret = _starpu_build_topology(&config);
+	/* Depending on whether we are a MP sink or not, we must build the
+	 * topology with MP nodes or not. */
+	ret = _starpu_build_topology(&config, is_a_sink ? 1 : 0);
 	if (ret)
 	if (ret)
 	{
 	{
 		STARPU_PTHREAD_MUTEX_LOCK(&init_mutex);
 		STARPU_PTHREAD_MUTEX_LOCK(&init_mutex);
 		init_count--;
 		init_count--;
+#ifdef STARPU_USE_SCC
+		if (_starpu_scc_common_is_mp_initialized())
+			_starpu_scc_src_mp_deinit();
+#endif
 		initialized = UNINITIALIZED;
 		initialized = UNINITIALIZED;
 		/* Let somebody else try to do it */
 		/* Let somebody else try to do it */
 		STARPU_PTHREAD_COND_SIGNAL(&init_cond);
 		STARPU_PTHREAD_COND_SIGNAL(&init_cond);
@@ -799,12 +998,14 @@ int starpu_init(struct starpu_conf *user_conf)
 	 * threads */
 	 * threads */
 	_starpu_initialize_current_task_key();
 	_starpu_initialize_current_task_key();
 
 
-	_starpu_create_sched_ctx(config.conf->sched_policy_name, NULL, -1, 1, "init");
+	if (!is_a_sink)
+		_starpu_create_sched_ctx(config.conf->sched_policy_name, NULL, -1, 1, "init");
 
 
 	_starpu_initialize_registered_performance_models();
 	_starpu_initialize_registered_performance_models();
 
 
 	/* Launch "basic" workers (ie. non-combined workers) */
 	/* Launch "basic" workers (ie. non-combined workers) */
-	_starpu_launch_drivers(&config);
+	if (!is_a_sink)
+		_starpu_launch_drivers(&config);
 
 
 	STARPU_PTHREAD_MUTEX_LOCK(&init_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK(&init_mutex);
 	initialized = INITIALIZED;
 	initialized = INITIALIZED;
@@ -813,6 +1014,20 @@ int starpu_init(struct starpu_conf *user_conf)
 	STARPU_PTHREAD_MUTEX_UNLOCK(&init_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&init_mutex);
 
 
 	_STARPU_DEBUG("Initialisation finished\n");
 	_STARPU_DEBUG("Initialisation finished\n");
+
+#ifdef STARPU_USE_MP
+	/* Finally, if we are a MP sink, we never leave this function:
+	 * we enter an infinite event loop which listens for MP commands from
+	 * the source. */
+	if (is_a_sink) {
+		_starpu_sink_common_worker();
+
+		/* We should normally never leave the loop as we don't want to
+		 * really initialize STARPU */
+		STARPU_ASSERT(0);
+	}
+#endif
+
 	return 0;
 	return 0;
 }
 }
 
 
@@ -843,7 +1058,7 @@ static void _starpu_terminate_workers(struct _starpu_machine_config *pconfig)
  		 * we have to check if pthread_self() is the worker itself */
  		 * we have to check if pthread_self() is the worker itself */
 		if (set)
 		if (set)
 		{
 		{
-			if (!set->joined)
+			if (set->started)
 			{
 			{
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 				status = starpu_pthread_join(set->worker_thread, NULL);
 				status = starpu_pthread_join(set->worker_thread, NULL);
@@ -857,7 +1072,7 @@ static void _starpu_terminate_workers(struct _starpu_machine_config *pconfig)
 					_STARPU_DEBUG("starpu_pthread_join -> %d\n", status);
 					_STARPU_DEBUG("starpu_pthread_join -> %d\n", status);
 				}
 				}
 #endif
 #endif
-				set->joined = 1;
+				set->started = 0;
 			}
 			}
 		}
 		}
 		else
 		else
@@ -1012,6 +1227,11 @@ void starpu_shutdown(void)
 	if (AYU_event) AYU_event(AYU_FINISH, 0, NULL);
 	if (AYU_event) AYU_event(AYU_FINISH, 0, NULL);
 #endif
 #endif
 
 
+#ifdef STARPU_USE_SCC
+	if (_starpu_scc_common_is_mp_initialized())
+		_starpu_scc_src_mp_deinit();
+#endif
+
 	_STARPU_DEBUG("Shutdown finished\n");
 	_STARPU_DEBUG("Shutdown finished\n");
 }
 }
 
 
@@ -1033,6 +1253,12 @@ int starpu_worker_get_count_by_type(enum starpu_worker_archtype type)
 		case STARPU_OPENCL_WORKER:
 		case STARPU_OPENCL_WORKER:
 			return config.topology.nopenclgpus;
 			return config.topology.nopenclgpus;
 
 
+		case STARPU_MIC_WORKER:
+			return config.topology.nmicdevices;
+
+		case STARPU_SCC_WORKER:
+			return config.topology.nsccdevices;
+
 		default:
 		default:
 			return -EINVAL;
 			return -EINVAL;
 	}
 	}
@@ -1073,6 +1299,26 @@ int starpu_asynchronous_opencl_copy_disabled(void)
 	return config.conf->disable_asynchronous_opencl_copy;
 	return config.conf->disable_asynchronous_opencl_copy;
 }
 }
 
 
+/* Report the disable_asynchronous_mic_copy setting of the configuration
+ * currently in use. */
+int starpu_asynchronous_mic_copy_disabled(void)
+{
+	int disabled = config.conf->disable_asynchronous_mic_copy;
+
+	return disabled;
+}
+
+/* Return the sum of topology.nmiccores[] over all the STARPU_MAXMICDEVS
+ * device slots. The index and the accumulator are unsigned, like the
+ * return type. */
+unsigned starpu_mic_worker_get_count(void)
+{
+	unsigned i, count = 0;
+
+	for (i = 0; i < STARPU_MAXMICDEVS; i++)
+		count += config.topology.nmiccores[i];
+
+	return count;
+}
+
+/* Return the nsccdevices field of the current topology. */
+unsigned starpu_scc_worker_get_count(void)
+{
+	unsigned count = config.topology.nsccdevices;
+
+	return count;
+}
+
 /* When analyzing performance, it is useful to see what is the processing unit
 /* When analyzing performance, it is useful to see what is the processing unit
  * that actually performed the task. This function returns the id of the
  * that actually performed the task. This function returns the id of the
  * processing unit actually executing it, therefore it makes no sense to use it
  * processing unit actually executing it, therefore it makes no sense to use it
@@ -1146,6 +1392,11 @@ int starpu_combined_worker_get_rank(void)
 	}
 	}
 }
 }
 
 
+/* Return the mp_nodeid field of worker number id. The value of id is
+ * used as an array index without any range check. */
+int starpu_worker_get_mp_nodeid(int id)
+{
+	int nodeid = config.workers[id].mp_nodeid;
+
+	return nodeid;
+}
+
 int starpu_worker_get_devid(int id)
 int starpu_worker_get_devid(int id)
 {
 {
 	return config.workers[id].devid;
 	return config.workers[id].devid;

+ 26 - 1
src/core/workers.h

@@ -37,6 +37,15 @@
 #include <drivers/cuda/driver_cuda.h>
 #include <drivers/cuda/driver_cuda.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 
 
+#ifdef STARPU_USE_MIC
+#include <drivers/mic/driver_mic_source.h>
+#endif /* STARPU_USE_MIC */
+
+#ifdef STARPU_USE_SCC
+#include <drivers/scc/driver_scc_source.h>
+#endif
+
+
 #include <drivers/cpu/driver_cpu.h>
 #include <drivers/cpu/driver_cpu.h>
 
 
 #include <datawizard/datawizard.h>
 #include <datawizard/datawizard.h>
@@ -51,6 +60,8 @@ struct _starpu_worker
 	uint32_t worker_mask; /* what is the type of worker ? */
 	uint32_t worker_mask; /* what is the type of worker ? */
 	enum starpu_perfmodel_archtype perf_arch; /* in case there are different models of the same arch */
 	enum starpu_perfmodel_archtype perf_arch; /* in case there are different models of the same arch */
 	starpu_pthread_t worker_thread; /* the thread which runs the worker */
 	starpu_pthread_t worker_thread; /* the thread which runs the worker */
+	int mp_nodeid; /* which mp node hold the cpu/gpu/etc (-1 for this
+			* node) */
 	unsigned devid; /* which cpu/gpu/etc is controlled by the worker ? */
 	unsigned devid; /* which cpu/gpu/etc is controlled by the worker ? */
 	int bindid; /* which cpu is the driver bound to ? (logical index) */
 	int bindid; /* which cpu is the driver bound to ? (logical index) */
 	int workerid; /* uniquely identify the worker among all processing units types */
 	int workerid; /* uniquely identify the worker among all processing units types */
@@ -127,7 +138,7 @@ struct _starpu_worker_set
         starpu_pthread_mutex_t mutex;
         starpu_pthread_mutex_t mutex;
 	starpu_pthread_t worker_thread; /* the thread which runs the worker */
 	starpu_pthread_t worker_thread; /* the thread which runs the worker */
 	unsigned nworkers;
 	unsigned nworkers;
-	unsigned joined; /* only one thread may call pthread_join*/
+	unsigned started; /* Only one thread for the whole set */
 	void *retval;
 	void *retval;
 	struct _starpu_worker *workers;
 	struct _starpu_worker *workers;
         starpu_pthread_cond_t ready_cond; /* indicate when the set is ready */
         starpu_pthread_cond_t ready_cond; /* indicate when the set is ready */
@@ -151,6 +162,12 @@ struct _starpu_machine_config
 	/* Which GPU(s) do we use for OpenCL ? */
 	/* Which GPU(s) do we use for OpenCL ? */
 	int current_opencl_gpuid;
 	int current_opencl_gpuid;
 
 
+	/* Which MIC do we use? */
+	int current_mic_deviceid;
+
+	/* Which SCC do we use? */
+	int current_scc_deviceid;
+
 	/* Basic workers : each of this worker is running its own driver and
 	/* Basic workers : each of this worker is running its own driver and
 	 * can be combined with other basic workers. */
 	 * can be combined with other basic workers. */
 	struct _starpu_worker workers[STARPU_NMAXWORKERS];
 	struct _starpu_worker workers[STARPU_NMAXWORKERS];
@@ -181,6 +198,11 @@ struct _starpu_machine_config
 	unsigned submitting;
 	unsigned submitting;
 };
 };
 
 
+/* Three functions to manage argv, argc.
+ * The getters are declared with (void): an empty parameter list would leave
+ * the parameters unspecified instead of declaring a real prototype. */
+void _starpu_set_argc_argv(int *argc, char ***argv);
+int *_starpu_get_argc(void);
+char ***_starpu_get_argv(void);
+
 /* Fill conf with environment variables */
 /* Fill conf with environment variables */
 void _starpu_conf_check_environment(struct starpu_conf *conf);
 void _starpu_conf_check_environment(struct starpu_conf *conf);
 
 
@@ -199,6 +221,9 @@ uint32_t _starpu_can_submit_cpu_task(void);
 /* Is there a worker that can execute OpenCL code ? */
 /* Is there a worker that can execute OpenCL code ? */
 uint32_t _starpu_can_submit_opencl_task(void);
 uint32_t _starpu_can_submit_opencl_task(void);
 
 
+/* Is there a worker that can execute SCC code ? */
+uint32_t _starpu_can_submit_scc_task(void);
+
 /* Check whether there is anything that the worker should do instead of
 /* Check whether there is anything that the worker should do instead of
  * sleeping (waiting on something to happen). */
  * sleeping (waiting on something to happen). */
 unsigned _starpu_worker_can_block(unsigned memnode);
 unsigned _starpu_worker_can_block(unsigned memnode);

+ 8 - 0
src/datawizard/coherency.c

@@ -180,6 +180,11 @@ static int worker_supports_direct_access(unsigned node, unsigned handling_node)
 #endif
 #endif
 		case STARPU_OPENCL_RAM:
 		case STARPU_OPENCL_RAM:
 			return 0;
 			return 0;
+		case STARPU_MIC_RAM:
+			/* We don't handle direct MIC-MIC transfers yet */
+			return 0;
+		case STARPU_SCC_RAM:
+			return 1;
 		default:
 		default:
 			return 1;
 			return 1;
 	}
 	}
@@ -346,6 +351,9 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 								  unsigned async,
 								  unsigned async,
 								  void (*callback_func)(void *), void *callback_arg)
 								  void (*callback_func)(void *), void *callback_arg)
 {
 {
+	/* We don't care about commuting for data requests, that was handled before. */
+	mode &= ~STARPU_COMMUTE;
+
 	/* This function is called with handle's header lock taken */
 	/* This function is called with handle's header lock taken */
 	_starpu_spin_checklocked(&handle->header_lock);
 	_starpu_spin_checklocked(&handle->header_lock);
 
 

+ 10 - 7
src/datawizard/coherency.h

@@ -1,6 +1,6 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
- * Copyright (C) 2009-2012  Université de Bordeaux 1
+ * Copyright (C) 2009-2013  Université de Bordeaux 1
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  * Copyright (C) 2010, 2011, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
@@ -106,6 +106,9 @@ struct _starpu_data_state
 	 * the req_list anymore), i.e. the number of holders of the
 	 * the req_list anymore), i.e. the number of holders of the
 	 * current_mode rwlock */
 	 * current_mode rwlock */
 	unsigned refcnt;
 	unsigned refcnt;
+	/* Current access mode. Is always either STARPU_R, STARPU_W,
+	 * STARPU_SCRATCH or STARPU_REDUX, but never a combination such as
+	 * STARPU_RW. */
 	enum starpu_data_access_mode current_mode;
 	enum starpu_data_access_mode current_mode;
 	/* protect meta data */
 	/* protect meta data */
 	struct _starpu_spinlock header_lock;
 	struct _starpu_spinlock header_lock;
@@ -138,7 +141,7 @@ struct _starpu_data_state
 	/* Footprint which identifies data layout */
 	/* Footprint which identifies data layout */
 	uint32_t footprint;
 	uint32_t footprint;
 
 
-	/* where is the data home ? -1 if none yet */
+	/* where is the data home, i.e. which node it was registered from ? -1 if none yet */
 	int home_node;
 	int home_node;
 
 
 	/* what is the default write-through mask for that data ? */
 	/* what is the default write-through mask for that data ? */
@@ -163,8 +166,8 @@ struct _starpu_data_state
 	 * read-only mode should depend on that task implicitely if the
 	 * read-only mode should depend on that task implicitely if the
 	 * sequential_consistency flag is enabled. */
 	 * sequential_consistency flag is enabled. */
 	enum starpu_data_access_mode last_submitted_mode;
 	enum starpu_data_access_mode last_submitted_mode;
-	struct starpu_task *last_submitted_writer;
-	struct _starpu_task_wrapper_list *last_submitted_readers;
+	struct starpu_task *last_sync_task;
+	struct _starpu_task_wrapper_list *last_submitted_accessors;
 
 
 	/* If FxT is enabled, we keep track of "ghost dependencies": that is to
 	/* If FxT is enabled, we keep track of "ghost dependencies": that is to
 	 * say the dependencies that are not needed anymore, but that should
 	 * say the dependencies that are not needed anymore, but that should
@@ -172,9 +175,9 @@ struct _starpu_data_state
 	 * f(Aw) g(Aw), and that g is submitted after the termination of f, we
 	 * f(Aw) g(Aw), and that g is submitted after the termination of f, we
 	 * want to have f->g appear in the DAG even if StarPU does not need to
 	 * want to have f->g appear in the DAG even if StarPU does not need to
 	 * enforce this dependency anymore.*/
 	 * enforce this dependency anymore.*/
-	unsigned last_submitted_ghost_writer_id_is_valid;
-	unsigned long last_submitted_ghost_writer_id;
-	struct _starpu_jobid_list *last_submitted_ghost_readers_id;
+	unsigned last_submitted_ghost_sync_id_is_valid;
+	unsigned long last_submitted_ghost_sync_id;
+	struct _starpu_jobid_list *last_submitted_ghost_accessors_id;
 
 
 	struct _starpu_task_wrapper_list *post_sync_tasks;
 	struct _starpu_task_wrapper_list *post_sync_tasks;
 	unsigned post_sync_tasks_cnt;
 	unsigned post_sync_tasks_cnt;

+ 128 - 0
src/datawizard/copy_driver.c

@@ -320,6 +320,83 @@ static int copy_data_1_to_1_generic(starpu_data_handle_t handle,
 		}
 		}
 		break;
 		break;
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_MIC_RAM):
+		/* RAM -> MIC */
+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() ||
+				!(copy_methods->ram_to_mic_async || copy_methods->any_to_any))
+		{
+			/* this is not associated to a request so it's synchronous */
+			STARPU_ASSERT(copy_methods->ram_to_mic || copy_methods->any_to_any);
+			if (copy_methods->ram_to_mic)
+				copy_methods->ram_to_mic(src_interface, src_node, dst_interface, dst_node);
+			else
+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+		}
+		else
+		{
+			req->async_channel.type = STARPU_MIC_RAM;
+			if (copy_methods->ram_to_mic_async)
+				ret = copy_methods->ram_to_mic_async(src_interface, src_node, dst_interface, dst_node);
+			else
+			{
+				STARPU_ASSERT(copy_methods->any_to_any);
+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+			}
+			_starpu_mic_init_event(&(req->async_channel.event.mic_event), dst_node);
+		}
+		break;
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_MIC_RAM,STARPU_CPU_RAM):
+		/* MIC -> RAM */
+		if (!req || starpu_asynchronous_copy_disabled() || starpu_asynchronous_mic_copy_disabled() ||
+				!(copy_methods->mic_to_ram_async || copy_methods->any_to_any))
+		{
+			/* this is not associated to a request so it's synchronous */
+			STARPU_ASSERT(copy_methods->mic_to_ram || copy_methods->any_to_any);
+			if (copy_methods->mic_to_ram)
+				copy_methods->mic_to_ram(src_interface, src_node, dst_interface, dst_node);
+			else
+				copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+		}
+		else
+		{
+			req->async_channel.type = STARPU_MIC_RAM;
+			if (copy_methods->mic_to_ram_async)
+				ret = copy_methods->mic_to_ram_async(src_interface, src_node, dst_interface, dst_node);
+			else
+			{
+				STARPU_ASSERT(copy_methods->any_to_any);
+				ret = copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, &req->async_channel);
+			}
+			_starpu_mic_init_event(&(req->async_channel.event.mic_event), src_node);
+		}
+		break;
+#endif
+#ifdef STARPU_USE_SCC
+		/* SCC RAM associated to the master process is considered as
+		 * the main memory node. */
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_SCC_RAM):
+		/* master private SCC RAM -> slave private SCC RAM */
+		if (copy_methods->scc_src_to_sink)
+			copy_methods->scc_src_to_sink(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+		break;
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_CPU_RAM):
+		/* slave private SCC RAM -> master private SCC RAM */
+		if (copy_methods->scc_sink_to_src)
+			copy_methods->scc_sink_to_src(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+		break;
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_SCC_RAM):
+		/* slave private SCC RAM -> slave private SCC RAM */
+		if (copy_methods->scc_sink_to_sink)
+			copy_methods->scc_sink_to_sink(src_interface, src_node, dst_interface, dst_node);
+		else
+			copy_methods->any_to_any(src_interface, src_node, dst_interface, dst_node, NULL);
+		break;
+#endif
 	default:
 	default:
 		STARPU_ABORT();
 		STARPU_ABORT();
 		break;
 		break;
@@ -438,6 +515,47 @@ int starpu_interface_copy(uintptr_t src, size_t src_offset, unsigned src_node, u
 				size,
 				size,
 				&async_channel->event.opencl_event);
 				&async_channel->event.opencl_event);
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_MIC_RAM,STARPU_CPU_RAM):
+		if (async_data)
+			return _starpu_mic_copy_mic_to_ram_async(
+					(void*) src + src_offset, src_node,
+					(void*) dst + dst_offset, dst_node,
+					size);
+		else
+			return _starpu_mic_copy_mic_to_ram(
+					(void*) src + src_offset, src_node,
+					(void*) dst + dst_offset, dst_node,
+					size);
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_MIC_RAM):
+		if (async_data)
+			return _starpu_mic_copy_ram_to_mic_async(
+					(void*) src + src_offset, src_node,
+					(void*) dst + dst_offset, dst_node,
+					size);
+		else
+			return _starpu_mic_copy_ram_to_mic(
+					(void*) src + src_offset, src_node,
+					(void*) dst + dst_offset, dst_node,
+					size);
+#endif
+#ifdef STARPU_USE_SCC
+	/* Return the result of the copy right away, like the MIC cases above:
+	 * falling through would run the other SCC copies as well and finally
+	 * reach STARPU_ABORT() in the default case. */
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_CPU_RAM):
+		return _starpu_scc_copy_sink_to_src(
+				(void*) src + src_offset, src_node,
+				(void*) dst + dst_offset, dst_node,
+				size);
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_CPU_RAM,STARPU_SCC_RAM):
+		return _starpu_scc_copy_src_to_sink(
+				(void*) src + src_offset, src_node,
+				(void*) dst + dst_offset, dst_node,
+				size);
+	case _STARPU_MEMORY_NODE_TUPLE(STARPU_SCC_RAM,STARPU_SCC_RAM):
+		return _starpu_scc_copy_sink_to_sink(
+				(void*) src + src_offset, src_node,
+				(void*) dst + dst_offset, dst_node,
+				size);
+#endif
 	default:
 	default:
 		STARPU_ABORT();
 		STARPU_ABORT();
 		return -1;
 		return -1;
@@ -490,6 +608,11 @@ void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_
 	      break;
 	      break;
 	}
 	}
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+	case STARPU_MIC_RAM:
+		_starpu_mic_wait_request_completion(&(async_channel->event.mic_event));
+		break;
+#endif
 	case STARPU_CPU_RAM:
 	case STARPU_CPU_RAM:
 	default:
 	default:
 		STARPU_ABORT();
 		STARPU_ABORT();
@@ -541,6 +664,11 @@ unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *as
 		break;
 		break;
 	}
 	}
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+	case STARPU_MIC_RAM:
+		success = _starpu_mic_request_is_complete(&(async_channel->event.mic_event));
+		break;
+#endif
 	case STARPU_CPU_RAM:
 	case STARPU_CPU_RAM:
 	default:
 	default:
 		STARPU_ABORT();
 		STARPU_ABORT();

+ 15 - 0
src/datawizard/copy_driver.h

@@ -36,6 +36,18 @@
 struct _starpu_data_request;
 struct _starpu_data_request;
 struct _starpu_data_replicate;
 struct _starpu_data_replicate;
 
 
+#ifdef STARPU_USE_MIC
+/* MIC needs memory_node to know which MIC is concerned.
+ * mark is used to wait for an asynchronous request.
+ * signal is used to test an asynchronous request. */
+struct _starpu_mic_async_event
+{
+	unsigned memory_node; /* memory node of the MIC concerned by the request */
+	int mark; /* used when waiting for the request */
+	uint64_t *signal; /* used when testing the request */
+};
+#endif
+
 /* this is a structure that can be queried to see whether an asynchronous
 /* this is a structure that can be queried to see whether an asynchronous
  * transfer has terminated or not */
  * transfer has terminated or not */
 union _starpu_async_channel_event
 union _starpu_async_channel_event
@@ -54,6 +66,9 @@ union _starpu_async_channel_event
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
         cl_event opencl_event;
         cl_event opencl_event;
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+	struct _starpu_mic_async_event mic_event;
+#endif
 };
 };
 
 
 struct _starpu_async_channel
 struct _starpu_async_channel

+ 4 - 1
src/datawizard/data_request.h

@@ -43,7 +43,10 @@ LIST_TYPE(_starpu_data_request,
 	struct _starpu_data_replicate *src_replicate;
 	struct _starpu_data_replicate *src_replicate;
 	struct _starpu_data_replicate *dst_replicate;
 	struct _starpu_data_replicate *dst_replicate;
 
 
-	/* Which memory node will actually perform the transfer */
+	/* Which memory node will actually perform the transfer.
+	 * This is important in the CUDA/OpenCL case, where only the worker for
+	 * the node can make the CUDA/OpenCL calls.
+	 */
 	unsigned handling_node;
 	unsigned handling_node;
 
 
 	/*
 	/*

+ 5 - 5
src/datawizard/filters.c

@@ -184,8 +184,8 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 
 
 		STARPU_PTHREAD_MUTEX_INIT(&child->sequential_consistency_mutex, NULL);
 		STARPU_PTHREAD_MUTEX_INIT(&child->sequential_consistency_mutex, NULL);
 		child->last_submitted_mode = STARPU_R;
 		child->last_submitted_mode = STARPU_R;
-		child->last_submitted_writer = NULL;
-		child->last_submitted_readers = NULL;
+		child->last_sync_task = NULL;
+		child->last_submitted_accessors = NULL;
 		child->post_sync_tasks = NULL;
 		child->post_sync_tasks = NULL;
 		child->post_sync_tasks_cnt = 0;
 		child->post_sync_tasks_cnt = 0;
 
 
@@ -195,9 +195,9 @@ void starpu_data_partition(starpu_data_handle_t initial_handle, struct starpu_da
 		child->init_cl = initial_handle->init_cl;
 		child->init_cl = initial_handle->init_cl;
 
 
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
-		child->last_submitted_ghost_writer_id_is_valid = 0;
-		child->last_submitted_ghost_writer_id = 0;
-		child->last_submitted_ghost_readers_id = NULL;
+		child->last_submitted_ghost_sync_id_is_valid = 0;
+		child->last_submitted_ghost_sync_id = 0;
+		child->last_submitted_ghost_accessors_id = NULL;
 #endif
 #endif
 
 
 		for (node = 0; node < STARPU_MAXNODES; node++)
 		for (node = 0; node < STARPU_MAXNODES; node++)

+ 1 - 0
src/datawizard/interfaces/bcsr_filters.c

@@ -35,6 +35,7 @@ void starpu_bcsr_filter_canonical_block(void *father_interface, void *child_inte
 
 
 	uint32_t ptr_offset = c*r*id*elemsize;
 	uint32_t ptr_offset = c*r*id*elemsize;
 
 
+	matrix_child->id = STARPU_MATRIX_INTERFACE_ID;
 	matrix_child->nx = c;
 	matrix_child->nx = c;
 	matrix_child->ny = r;
 	matrix_child->ny = r;
 	matrix_child->ld = c;
 	matrix_child->ld = c;

+ 4 - 2
src/datawizard/interfaces/bcsr_interface.c

@@ -46,7 +46,7 @@ static int bcsr_compare(void *data_interface_a, void *data_interface_b);
 static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle_t handle);
 static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle_t handle);
 
 
 
 
-static struct starpu_data_interface_ops interface_bcsr_ops =
+struct starpu_data_interface_ops starpu_interface_bcsr_ops =
 {
 {
 	.register_data_handle = register_bcsr_handle,
 	.register_data_handle = register_bcsr_handle,
 	.allocate_data_on_node = allocate_bcsr_buffer_on_node,
 	.allocate_data_on_node = allocate_bcsr_buffer_on_node,
@@ -82,6 +82,7 @@ static void register_bcsr_handle(starpu_data_handle_t handle, unsigned home_node
 			local_interface->rowptr = NULL;
 			local_interface->rowptr = NULL;
 		}
 		}
 
 
+		local_interface->id = bcsr_interface->id;
 		local_interface->nnz = bcsr_interface->nnz;
 		local_interface->nnz = bcsr_interface->nnz;
 		local_interface->nrow = bcsr_interface->nrow;
 		local_interface->nrow = bcsr_interface->nrow;
 		local_interface->firstentry = bcsr_interface->firstentry;
 		local_interface->firstentry = bcsr_interface->firstentry;
@@ -98,6 +99,7 @@ void starpu_bcsr_data_register(starpu_data_handle_t *handleptr, unsigned home_no
 {
 {
 	struct starpu_bcsr_interface bcsr_interface =
 	struct starpu_bcsr_interface bcsr_interface =
 	{
 	{
+		.id = STARPU_BCSR_INTERFACE_ID,
 		.nzval = nzval,
 		.nzval = nzval,
 		.colind = colind,
 		.colind = colind,
 		.rowptr = rowptr,
 		.rowptr = rowptr,
@@ -109,7 +111,7 @@ void starpu_bcsr_data_register(starpu_data_handle_t *handleptr, unsigned home_no
 		.elemsize = elemsize
 		.elemsize = elemsize
 	};
 	};
 
 
-	starpu_data_register(handleptr, home_node, &bcsr_interface, &interface_bcsr_ops);
+	starpu_data_register(handleptr, home_node, &bcsr_interface, &starpu_interface_bcsr_ops);
 }
 }
 
 
 static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle_t handle)
 static uint32_t footprint_bcsr_interface_crc32(starpu_data_handle_t handle)

+ 1 - 0
src/datawizard/interfaces/block_filters.c

@@ -37,6 +37,7 @@ void starpu_block_filter_block(void *father_interface, void *child_interface, ST
 	_starpu_filter_nparts_compute_chunk_size_and_offset(nx, nparts, elemsize, id, 1,
 	_starpu_filter_nparts_compute_chunk_size_and_offset(nx, nparts, elemsize, id, 1,
 				       &chunk_size, &offset);
 				       &chunk_size, &offset);
 
 
+	block_child->id = block_father->id;
 	block_child->nx = chunk_size;
 	block_child->nx = chunk_size;
 	block_child->ny = ny;
 	block_child->ny = ny;
 	block_child->nz = nz;
 	block_child->nz = nz;

+ 197 - 2
src/datawizard/interfaces/block_interface.c

@@ -26,6 +26,8 @@
 #include <starpu_cuda.h>
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
+#include <drivers/scc/driver_scc_source.h>
+#include <drivers/mic/driver_mic_source.h>
 
 
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED);
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
@@ -43,6 +45,17 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
 static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
 static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
 #endif
 #endif
+#ifdef STARPU_USE_SCC
+static int copy_scc_src_to_sink(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_scc_sink_to_src(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_scc_sink_to_sink(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+#endif
+#ifdef STARPU_USE_MIC
+static int copy_ram_to_mic(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_mic_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_ram_to_mic_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_mic_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+#endif
 
 
 static const struct starpu_data_copy_methods block_copy_data_methods_s =
 static const struct starpu_data_copy_methods block_copy_data_methods_s =
 {
 {
@@ -62,6 +75,17 @@ static const struct starpu_data_copy_methods block_copy_data_methods_s =
 	.opencl_to_ram_async = copy_opencl_to_ram_async,
 	.opencl_to_ram_async = copy_opencl_to_ram_async,
 	.opencl_to_opencl_async = copy_opencl_to_opencl_async,
 	.opencl_to_opencl_async = copy_opencl_to_opencl_async,
 #endif
 #endif
+#ifdef STARPU_USE_SCC
+	.scc_src_to_sink = copy_scc_src_to_sink,
+	.scc_sink_to_src = copy_scc_sink_to_src,
+	.scc_sink_to_sink = copy_scc_sink_to_sink,
+#endif
+#ifdef STARPU_USE_MIC
+	.ram_to_mic = copy_ram_to_mic,
+	.mic_to_ram = copy_mic_to_ram,
+	.ram_to_mic_async = copy_ram_to_mic_async,
+	.mic_to_ram_async = copy_mic_to_ram_async,
+#endif
 };
 };
 
 
 
 
@@ -74,7 +98,7 @@ static uint32_t footprint_block_interface_crc32(starpu_data_handle_t handle);
 static int block_compare(void *data_interface_a, void *data_interface_b);
 static int block_compare(void *data_interface_a, void *data_interface_b);
 static void display_block_interface(starpu_data_handle_t handle, FILE *f);
 static void display_block_interface(starpu_data_handle_t handle, FILE *f);
 
 
-static struct starpu_data_interface_ops interface_block_ops =
+struct starpu_data_interface_ops starpu_interface_block_ops =
 {
 {
 	.register_data_handle = register_block_handle,
 	.register_data_handle = register_block_handle,
 	.allocate_data_on_node = allocate_block_buffer_on_node,
 	.allocate_data_on_node = allocate_block_buffer_on_node,
@@ -126,6 +150,7 @@ static void register_block_handle(starpu_data_handle_t handle, unsigned home_nod
 			local_interface->ldz  = 0;
 			local_interface->ldz  = 0;
 		}
 		}
 
 
+		local_interface->id = block_interface->id;
 		local_interface->nx = block_interface->nx;
 		local_interface->nx = block_interface->nx;
 		local_interface->ny = block_interface->ny;
 		local_interface->ny = block_interface->ny;
 		local_interface->nz = block_interface->nz;
 		local_interface->nz = block_interface->nz;
@@ -140,6 +165,7 @@ void starpu_block_data_register(starpu_data_handle_t *handleptr, unsigned home_n
 {
 {
 	struct starpu_block_interface block_interface =
 	struct starpu_block_interface block_interface =
 	{
 	{
+		.id = STARPU_BLOCK_INTERFACE_ID,
 		.ptr = ptr,
 		.ptr = ptr,
                 .dev_handle = ptr,
                 .dev_handle = ptr,
                 .offset = 0,
                 .offset = 0,
@@ -151,7 +177,12 @@ void starpu_block_data_register(starpu_data_handle_t *handleptr, unsigned home_n
 		.elemsize = elemsize
 		.elemsize = elemsize
 	};
 	};
 
 
-	starpu_data_register(handleptr, home_node, &block_interface, &interface_block_ops);
+#ifdef STARPU_USE_SCC
+	_starpu_scc_set_offset_in_shared_memory((void*)block_interface.ptr,
+			(void**)&(block_interface.dev_handle), &(block_interface.offset));
+#endif
+
+	starpu_data_register(handleptr, home_node, &block_interface, &starpu_interface_block_ops);
 }
 }
 
 
 static uint32_t footprint_block_interface_crc32(starpu_data_handle_t handle)
 static uint32_t footprint_block_interface_crc32(starpu_data_handle_t handle)
@@ -584,6 +615,170 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_A
 
 
 #endif
 #endif
 
 
+#ifdef STARPU_USE_SCC
+/* Copy a 3D block from the SCC source (master) memory to a sink memory node.
+ * The block is transferred as ny*nz contiguous rows of nx elements through
+ * _starpu_scc_copy_src_to_sink(). Dimensions and element size are read from
+ * the destination interface, leading dimensions from each side.
+ * The results of the individual copies are not checked: always returns 0. */
+static int copy_scc_src_to_sink(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	uint32_t nx = STARPU_BLOCK_GET_NX(dst_interface);
+	uint32_t ny = STARPU_BLOCK_GET_NY(dst_interface);
+	uint32_t nz = STARPU_BLOCK_GET_NZ(dst_interface);
+
+	size_t elemsize = STARPU_BLOCK_GET_ELEMSIZE(dst_interface);
+
+	uint32_t src_ldy = STARPU_BLOCK_GET_LDY(src_interface);
+	uint32_t src_ldz = STARPU_BLOCK_GET_LDZ(src_interface);
+	uint32_t dst_ldy = STARPU_BLOCK_GET_LDY(dst_interface);
+	uint32_t dst_ldz = STARPU_BLOCK_GET_LDZ(dst_interface);
+
+	/* Byte pointers, so that the offset arithmetic below is standard C
+	 * (arithmetic on void * is a GNU extension). */
+	char *src_ptr = (char *)STARPU_BLOCK_GET_PTR(src_interface);
+	char *dst_ptr = (char *)STARPU_BLOCK_GET_PTR(dst_interface);
+
+	unsigned y, z;
+	for (z = 0; z < nz; ++z)
+	{
+		for (y = 0; y < ny; ++y)
+		{
+			/* Offsets are computed in size_t so that they are not
+			 * truncated to 32 bits for blocks larger than 4 GiB. */
+			size_t src_offset = ((size_t)y*src_ldy + (size_t)z*src_ldz) * elemsize;
+			size_t dst_offset = ((size_t)y*dst_ldy + (size_t)z*dst_ldz) * elemsize;
+
+			_starpu_scc_copy_src_to_sink(src_ptr + src_offset, src_node,
+							dst_ptr + dst_offset, dst_node, nx*elemsize);
+		}
+	}
+
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)nx*ny*nz*elemsize);
+
+	return 0;
+}
+
+/* Copy a 3D block from an SCC sink memory node back to the source (master)
+ * memory. The block is transferred as ny*nz contiguous rows of nx elements
+ * through _starpu_scc_copy_sink_to_src(). Dimensions and element size are
+ * read from the destination interface, leading dimensions from each side.
+ * The results of the individual copies are not checked: always returns 0. */
+static int copy_scc_sink_to_src(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	uint32_t nx = STARPU_BLOCK_GET_NX(dst_interface);
+	uint32_t ny = STARPU_BLOCK_GET_NY(dst_interface);
+	uint32_t nz = STARPU_BLOCK_GET_NZ(dst_interface);
+
+	size_t elemsize = STARPU_BLOCK_GET_ELEMSIZE(dst_interface);
+
+	uint32_t src_ldy = STARPU_BLOCK_GET_LDY(src_interface);
+	uint32_t src_ldz = STARPU_BLOCK_GET_LDZ(src_interface);
+	uint32_t dst_ldy = STARPU_BLOCK_GET_LDY(dst_interface);
+	uint32_t dst_ldz = STARPU_BLOCK_GET_LDZ(dst_interface);
+
+	/* Byte pointers, so that the offset arithmetic below is standard C
+	 * (arithmetic on void * is a GNU extension). */
+	char *src_ptr = (char *)STARPU_BLOCK_GET_PTR(src_interface);
+	char *dst_ptr = (char *)STARPU_BLOCK_GET_PTR(dst_interface);
+
+	unsigned y, z;
+	for (z = 0; z < nz; ++z)
+	{
+		for (y = 0; y < ny; ++y)
+		{
+			/* Offsets are computed in size_t so that they are not
+			 * truncated to 32 bits for blocks larger than 4 GiB. */
+			size_t src_offset = ((size_t)y*src_ldy + (size_t)z*src_ldz) * elemsize;
+			size_t dst_offset = ((size_t)y*dst_ldy + (size_t)z*dst_ldz) * elemsize;
+
+			_starpu_scc_copy_sink_to_src(src_ptr + src_offset, src_node,
+							dst_ptr + dst_offset, dst_node, nx*elemsize);
+		}
+	}
+
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)nx*ny*nz*elemsize);
+
+	return 0;
+}
+
+/* Copy a 3D block from one SCC sink memory node to another.
+ * The block is transferred as ny*nz contiguous rows of nx elements through
+ * _starpu_scc_copy_sink_to_sink(). Dimensions and element size are read from
+ * the destination interface, leading dimensions from each side.
+ * The results of the individual copies are not checked: always returns 0. */
+static int copy_scc_sink_to_sink(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	uint32_t nx = STARPU_BLOCK_GET_NX(dst_interface);
+	uint32_t ny = STARPU_BLOCK_GET_NY(dst_interface);
+	uint32_t nz = STARPU_BLOCK_GET_NZ(dst_interface);
+
+	size_t elemsize = STARPU_BLOCK_GET_ELEMSIZE(dst_interface);
+
+	uint32_t src_ldy = STARPU_BLOCK_GET_LDY(src_interface);
+	uint32_t src_ldz = STARPU_BLOCK_GET_LDZ(src_interface);
+	uint32_t dst_ldy = STARPU_BLOCK_GET_LDY(dst_interface);
+	uint32_t dst_ldz = STARPU_BLOCK_GET_LDZ(dst_interface);
+
+	/* Byte pointers, so that the offset arithmetic below is standard C
+	 * (arithmetic on void * is a GNU extension). */
+	char *src_ptr = (char *)STARPU_BLOCK_GET_PTR(src_interface);
+	char *dst_ptr = (char *)STARPU_BLOCK_GET_PTR(dst_interface);
+
+	unsigned y, z;
+	for (z = 0; z < nz; ++z)
+	{
+		for (y = 0; y < ny; ++y)
+		{
+			/* Offsets are computed in size_t so that they are not
+			 * truncated to 32 bits for blocks larger than 4 GiB. */
+			size_t src_offset = ((size_t)y*src_ldy + (size_t)z*src_ldz) * elemsize;
+			size_t dst_offset = ((size_t)y*dst_ldy + (size_t)z*dst_ldz) * elemsize;
+
+			_starpu_scc_copy_sink_to_sink(src_ptr + src_offset, src_node,
+					dst_ptr + dst_offset, dst_node, nx*elemsize);
+		}
+	}
+
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)nx*ny*nz*elemsize);
+
+	return 0;
+}
+#endif /* STARPU_USE_SCC */
+
+#ifdef STARPU_USE_MIC
+/* Copy a 3D block between main memory and a MIC, one row at a time.
+ * copy_func is called once per contiguous row of nx elements (ny*nz calls in
+ * total). Dimensions and element size are read from the destination
+ * interface, leading dimensions from each side.
+ * The values returned by copy_func are not checked: always returns 0. */
+static int copy_mic_common(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node,
+						   int (*copy_func)(void *, unsigned, void *, unsigned, size_t))
+{
+	struct starpu_block_interface *src_block = src_interface;
+	struct starpu_block_interface *dst_block = dst_interface;
+
+	uint32_t nx = dst_block->nx;
+	uint32_t ny = dst_block->ny;
+	uint32_t nz = dst_block->nz;
+	size_t elemsize = dst_block->elemsize;
+
+	uint32_t ldy_src = src_block->ldy;
+	uint32_t ldz_src = src_block->ldz;
+	uint32_t ldy_dst = dst_block->ldy;
+	uint32_t ldz_dst = dst_block->ldz;
+
+	uintptr_t ptr_src = src_block->ptr;
+	uintptr_t ptr_dst = dst_block->ptr;
+
+	unsigned y, z;
+	for (z = 0; z < nz; z++)
+	{
+		for (y = 0; y < ny; y++)
+		{
+			/* Offsets are computed in size_t so that they are not
+			 * truncated to 32 bits for blocks larger than 4 GiB. */
+			size_t src_offset = ((size_t)y*ldy_src + (size_t)z*ldz_src)*elemsize;
+			size_t dst_offset = ((size_t)y*ldy_dst + (size_t)z*ldz_dst)*elemsize;
+
+			copy_func((void *)(ptr_src + src_offset), src_node, (void *)(ptr_dst + dst_offset), dst_node, nx*elemsize);
+		}
+	}
+
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)nx*ny*nz*elemsize);
+
+	return 0;
+}
+/* Block copy, main memory to MIC: runs copy_mic_common() with
+ * _starpu_mic_copy_ram_to_mic as the per-row copy and returns its result. */
+static int copy_ram_to_mic(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	return copy_mic_common(src_interface, src_node, dst_interface, dst_node, _starpu_mic_copy_ram_to_mic);
+}
+
+/* Block copy, MIC to main memory: runs copy_mic_common() with
+ * _starpu_mic_copy_mic_to_ram as the per-row copy and returns its result. */
+static int copy_mic_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	return copy_mic_common(src_interface, src_node, dst_interface, dst_node, _starpu_mic_copy_mic_to_ram);
+}
+
+/* Asynchronous block copy, main memory to MIC: runs copy_mic_common() with
+ * _starpu_mic_copy_ram_to_mic_async as the per-row copy.
+ * The result of copy_mic_common() is dropped and -EAGAIN is always returned
+ * (presumably to signal that the transfer is still in progress -- confirm
+ * against the caller). */
+static int copy_ram_to_mic_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	copy_mic_common(src_interface, src_node, dst_interface, dst_node, _starpu_mic_copy_ram_to_mic_async);
+	return -EAGAIN;
+}
+
+/* Asynchronous block copy, MIC to main memory: runs copy_mic_common() with
+ * _starpu_mic_copy_mic_to_ram_async as the per-row copy.
+ * The result of copy_mic_common() is dropped and -EAGAIN is always returned
+ * (presumably to signal that the transfer is still in progress -- confirm
+ * against the caller). */
+static int copy_mic_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	copy_mic_common(src_interface, src_node, dst_interface, dst_node, _starpu_mic_copy_mic_to_ram_async);
+	return -EAGAIN;
+}
+#endif
+
 /* as not all platform easily have a BLAS lib installed ... */
 /* as not all platform easily have a BLAS lib installed ... */
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 {

+ 4 - 2
src/datawizard/interfaces/coo_interface.c

@@ -89,6 +89,7 @@ register_coo_handle(starpu_data_handle_t handle, unsigned home_node,
 			local_interface->rows = 0;
 			local_interface->rows = 0;
 		}
 		}
 
 
+		local_interface->id = coo_interface->id;
 		local_interface->nx = coo_interface->nx;
 		local_interface->nx = coo_interface->nx;
 		local_interface->ny = coo_interface->ny;
 		local_interface->ny = coo_interface->ny;
 		local_interface->n_values = coo_interface->n_values;
 		local_interface->n_values = coo_interface->n_values;
@@ -189,7 +190,7 @@ display_coo_interface(starpu_data_handle_t handle, FILE *f)
 	fprintf(f, "%u\t%u", coo_interface->nx, coo_interface->ny);
 	fprintf(f, "%u\t%u", coo_interface->nx, coo_interface->ny);
 }
 }
 
 
-struct starpu_data_interface_ops _starpu_interface_coo_ops =
+struct starpu_data_interface_ops starpu_interface_coo_ops =
 {
 {
 	.register_data_handle  = register_coo_handle,
 	.register_data_handle  = register_coo_handle,
 	.allocate_data_on_node = allocate_coo_buffer_on_node,
 	.allocate_data_on_node = allocate_coo_buffer_on_node,
@@ -212,6 +213,7 @@ starpu_coo_data_register(starpu_data_handle_t *handleptr, unsigned home_node,
 {
 {
 	struct starpu_coo_interface coo_interface =
 	struct starpu_coo_interface coo_interface =
 	{
 	{
+		.id = STARPU_COO_INTERFACE_ID,
 		.values = values,
 		.values = values,
 		.columns = columns,
 		.columns = columns,
 		.rows = rows,
 		.rows = rows,
@@ -222,5 +224,5 @@ starpu_coo_data_register(starpu_data_handle_t *handleptr, unsigned home_node,
 	};
 	};
 
 
 	starpu_data_register(handleptr, home_node, &coo_interface,
 	starpu_data_register(handleptr, home_node, &coo_interface,
-			     &_starpu_interface_coo_ops);
+			     &starpu_interface_coo_ops);
 }
 }

+ 1 - 0
src/datawizard/interfaces/csr_filters.c

@@ -46,6 +46,7 @@ void starpu_csr_filter_vertical_block(void *father_interface, void *child_interf
 
 
 	uint32_t local_nnz = rowptr[first_index + child_nrow] - rowptr[first_index];
 	uint32_t local_nnz = rowptr[first_index + child_nrow] - rowptr[first_index];
 
 
+	csr_child->id = csr_father->id;
 	csr_child->nnz = local_nnz;
 	csr_child->nnz = local_nnz;
 	csr_child->nrow = child_nrow;
 	csr_child->nrow = child_nrow;
 	csr_child->firstentry = local_firstentry;
 	csr_child->firstentry = local_firstentry;

+ 6 - 2
src/datawizard/interfaces/csr_interface.c

@@ -27,6 +27,8 @@
 #include <starpu_cuda.h>
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
+#include <drivers/scc/driver_scc_source.h>
+#include <drivers/mic/driver_mic_source.h>
 
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 
@@ -42,7 +44,7 @@ static size_t csr_interface_get_size(starpu_data_handle_t handle);
 static int csr_compare(void *data_interface_a, void *data_interface_b);
 static int csr_compare(void *data_interface_a, void *data_interface_b);
 static uint32_t footprint_csr_interface_crc32(starpu_data_handle_t handle);
 static uint32_t footprint_csr_interface_crc32(starpu_data_handle_t handle);
 
 
-static struct starpu_data_interface_ops interface_csr_ops =
+struct starpu_data_interface_ops starpu_interface_csr_ops =
 {
 {
 	.register_data_handle = register_csr_handle,
 	.register_data_handle = register_csr_handle,
 	.allocate_data_on_node = allocate_csr_buffer_on_node,
 	.allocate_data_on_node = allocate_csr_buffer_on_node,
@@ -76,6 +78,7 @@ static void register_csr_handle(starpu_data_handle_t handle, unsigned home_node,
 			local_interface->colind = NULL;
 			local_interface->colind = NULL;
 		}
 		}
 
 
+		local_interface->id = csr_interface->id;
 		local_interface->rowptr = csr_interface->rowptr;
 		local_interface->rowptr = csr_interface->rowptr;
 		local_interface->nnz = csr_interface->nnz;
 		local_interface->nnz = csr_interface->nnz;
 		local_interface->nrow = csr_interface->nrow;
 		local_interface->nrow = csr_interface->nrow;
@@ -91,6 +94,7 @@ void starpu_csr_data_register(starpu_data_handle_t *handleptr, unsigned home_nod
 {
 {
 	struct starpu_csr_interface csr_interface =
 	struct starpu_csr_interface csr_interface =
 	{
 	{
+		.id = STARPU_CSR_INTERFACE_ID,
 		.nnz = nnz,
 		.nnz = nnz,
 		.nrow = nrow,
 		.nrow = nrow,
 		.nzval = nzval,
 		.nzval = nzval,
@@ -100,7 +104,7 @@ void starpu_csr_data_register(starpu_data_handle_t *handleptr, unsigned home_nod
 		.elemsize = elemsize
 		.elemsize = elemsize
 	};
 	};
 
 
-	starpu_data_register(handleptr, home_node, &csr_interface, &interface_csr_ops);
+	starpu_data_register(handleptr, home_node, &csr_interface, &starpu_interface_csr_ops);
 }
 }
 
 
 static uint32_t footprint_csr_interface_crc32(starpu_data_handle_t handle)
 static uint32_t footprint_csr_interface_crc32(starpu_data_handle_t handle)

+ 54 - 6
src/datawizard/interfaces/data_interface.c

@@ -83,6 +83,40 @@ void _starpu_data_interface_shutdown()
 	registered_tag_handles = NULL;
 	registered_tag_handles = NULL;
 }
 }
 
 
+/* Return the operations table associated with a built-in interface
+ * identifier.
+ *
+ * interface_id: one of the STARPU_*_INTERFACE_ID values handled below.
+ *
+ * Any other value triggers STARPU_ABORT(); NULL is returned afterwards in
+ * case the abort macro returns.
+ *
+ * NOTE(review): STARPU_COO_INTERFACE_ID has no case here and therefore ends
+ * up in the default branch -- confirm this is intended.
+ */
+struct starpu_data_interface_ops *_starpu_data_interface_get_ops(unsigned interface_id)
+{
+	struct starpu_data_interface_ops *ops = NULL;
+
+	switch (interface_id)
+	{
+		case STARPU_MATRIX_INTERFACE_ID:
+			ops = &starpu_interface_matrix_ops;
+			break;
+
+		case STARPU_BLOCK_INTERFACE_ID:
+			ops = &starpu_interface_block_ops;
+			break;
+
+		case STARPU_VECTOR_INTERFACE_ID:
+			ops = &starpu_interface_vector_ops;
+			break;
+
+		case STARPU_CSR_INTERFACE_ID:
+			ops = &starpu_interface_csr_ops;
+			break;
+
+		case STARPU_BCSR_INTERFACE_ID:
+			ops = &starpu_interface_bcsr_ops;
+			break;
+
+		case STARPU_VARIABLE_INTERFACE_ID:
+			ops = &starpu_interface_variable_ops;
+			break;
+
+		case STARPU_VOID_INTERFACE_ID:
+			ops = &starpu_interface_void_ops;
+			break;
+
+		case STARPU_MULTIFORMAT_INTERFACE_ID:
+			ops = &starpu_interface_multiformat_ops;
+			break;
+
+		default:
+			/* Unknown identifier: ops stays NULL. */
+			STARPU_ABORT();
+			break;
+	}
+
+	return ops;
+}
+
 /* Register the mapping from PTR to HANDLE.  If PTR is already mapped to
 /* Register the mapping from PTR to HANDLE.  If PTR is already mapped to
  * some handle, the new mapping shadows the previous one.   */
  * some handle, the new mapping shadows the previous one.   */
 void _starpu_data_register_ram_pointer(starpu_data_handle_t handle, void *ptr)
 void _starpu_data_register_ram_pointer(starpu_data_handle_t handle, void *ptr)
@@ -163,8 +197,8 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 
 
 	STARPU_PTHREAD_MUTEX_INIT(&handle->sequential_consistency_mutex, NULL);
 	STARPU_PTHREAD_MUTEX_INIT(&handle->sequential_consistency_mutex, NULL);
 	handle->last_submitted_mode = STARPU_R;
 	handle->last_submitted_mode = STARPU_R;
-	handle->last_submitted_writer = NULL;
-	handle->last_submitted_readers = NULL;
+	handle->last_sync_task = NULL;
+	handle->last_submitted_accessors = NULL;
 	handle->post_sync_tasks = NULL;
 	handle->post_sync_tasks = NULL;
 	handle->post_sync_tasks_cnt = 0;
 	handle->post_sync_tasks_cnt = 0;
 
 
@@ -176,9 +210,9 @@ static void _starpu_register_new_data(starpu_data_handle_t handle,
 	handle->reduction_req_list = _starpu_data_requester_list_new();
 	handle->reduction_req_list = _starpu_data_requester_list_new();
 
 
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
-	handle->last_submitted_ghost_writer_id_is_valid = 0;
-	handle->last_submitted_ghost_writer_id = 0;
-	handle->last_submitted_ghost_readers_id = NULL;
+	handle->last_submitted_ghost_sync_id_is_valid = 0;
+	handle->last_submitted_ghost_sync_id = 0;
+	handle->last_submitted_ghost_accessors_id = NULL;
 #endif
 #endif
 
 
 	handle->wt_mask = wt_mask;
 	handle->wt_mask = wt_mask;
@@ -569,7 +603,10 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 		 * XXX : This is quite hacky, could we submit a task instead ?
 		 * XXX : This is quite hacky, could we submit a task instead ?
 		 */
 		 */
 		if (_starpu_data_is_multiformat_handle(handle) &&
 		if (_starpu_data_is_multiformat_handle(handle) &&
-			starpu_node_get_kind(handle->mf_node) != STARPU_CPU_RAM)
+			(  starpu_node_get_kind(handle->mf_node) != STARPU_CPU_RAM
+			&& starpu_node_get_kind(handle->mf_node) != STARPU_SCC_RAM
+			&& starpu_node_get_kind(handle->mf_node) != STARPU_SCC_SHM
+			 ))
 		{
 		{
 			_STARPU_DEBUG("Conversion needed\n");
 			_STARPU_DEBUG("Conversion needed\n");
 			void *buffers[1];
 			void *buffers[1];
@@ -598,7 +635,18 @@ static void _starpu_data_unregister(starpu_data_handle_t handle, unsigned cohere
 					break;
 					break;
 				}
 				}
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+				case STARPU_MIC_RAM:
+				{
+					struct starpu_multiformat_data_interface_ops *mf_ops;
+					mf_ops = (struct starpu_multiformat_data_interface_ops *) handle->ops->get_mf_ops(format_interface);
+					cl = mf_ops->mic_to_cpu_cl;
+					break;
+				}
+#endif
 				case STARPU_CPU_RAM:      /* Impossible ! */
 				case STARPU_CPU_RAM:      /* Impossible ! */
+				case STARPU_SCC_RAM:      /* Impossible ! */
+				case STARPU_SCC_SHM:      /* Impossible ! */
 				default:
 				default:
 					STARPU_ABORT();
 					STARPU_ABORT();
 			}
 			}

+ 26 - 1
src/datawizard/interfaces/data_interface.h

@@ -1,7 +1,7 @@
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
 /* StarPU --- Runtime system for heterogeneous multicore architectures.
  *
  *
  * Copyright (C) 2009-2012  Université de Bordeaux 1
  * Copyright (C) 2009-2012  Université de Bordeaux 1
- * Copyright (C) 2010, 2012  Centre National de la Recherche Scientifique
+ * Copyright (C) 2010, 2012, 2013  Centre National de la Recherche Scientifique
  *
  *
  * StarPU is free software; you can redistribute it and/or modify
  * StarPU is free software; you can redistribute it and/or modify
  * it under the terms of the GNU Lesser General Public License as published by
  * it under the terms of the GNU Lesser General Public License as published by
@@ -21,8 +21,31 @@
 #include <starpu.h>
 #include <starpu.h>
 #include <common/config.h>
 #include <common/config.h>
 
 
+/* Generic type able to hold any one of the interface structures listed below; for now
+ * it is only used before execution on message-passing devices, but may be useful elsewhere.
+ */
+union _starpu_interface
+{
+	struct starpu_matrix_interface matrix;
+	struct starpu_block_interface block;
+	struct starpu_vector_interface vector;
+	struct starpu_csr_interface csr;
+	struct starpu_coo_interface coo;
+	struct starpu_bcsr_interface bcsr;
+	struct starpu_variable_interface variable;
+	struct starpu_multiformat_interface multiformat;
+};
+
 /* Some data interfaces or filters use this interface internally */
 /* Some data interfaces or filters use this interface internally */
 extern struct starpu_data_interface_ops starpu_interface_matrix_ops;
 extern struct starpu_data_interface_ops starpu_interface_matrix_ops;
+extern struct starpu_data_interface_ops starpu_interface_block_ops;
+extern struct starpu_data_interface_ops starpu_interface_vector_ops;
+extern struct starpu_data_interface_ops starpu_interface_csr_ops;
+extern struct starpu_data_interface_ops starpu_interface_bcsr_ops;
+extern struct starpu_data_interface_ops starpu_interface_variable_ops;
+extern struct starpu_data_interface_ops starpu_interface_void_ops;
+extern struct starpu_data_interface_ops starpu_interface_multiformat_ops;
+
 void _starpu_data_free_interfaces(starpu_data_handle_t handle)
 void _starpu_data_free_interfaces(starpu_data_handle_t handle)
 	STARPU_ATTRIBUTE_INTERNAL;
 	STARPU_ATTRIBUTE_INTERNAL;
 
 
@@ -33,6 +56,8 @@ extern void _starpu_data_interface_init(void) STARPU_ATTRIBUTE_INTERNAL;
 extern int _starpu_data_check_not_busy(starpu_data_handle_t handle) STARPU_ATTRIBUTE_INTERNAL;
 extern int _starpu_data_check_not_busy(starpu_data_handle_t handle) STARPU_ATTRIBUTE_INTERNAL;
 extern void _starpu_data_interface_shutdown(void) STARPU_ATTRIBUTE_INTERNAL;
 extern void _starpu_data_interface_shutdown(void) STARPU_ATTRIBUTE_INTERNAL;
 
 
+struct starpu_data_interface_ops *_starpu_data_interface_get_ops(unsigned interface_id);
+
 extern void _starpu_data_register_ram_pointer(starpu_data_handle_t handle,
 extern void _starpu_data_register_ram_pointer(starpu_data_handle_t handle,
 						void *ptr)
 						void *ptr)
 	STARPU_ATTRIBUTE_INTERNAL;
 	STARPU_ATTRIBUTE_INTERNAL;

+ 2 - 0
src/datawizard/interfaces/matrix_filters.c

@@ -41,6 +41,7 @@ void starpu_matrix_filter_block(void *father_interface, void *child_interface, S
 						     &child_nx, &offset);
 						     &child_nx, &offset);
 
 
 	/* update the child's interface */
 	/* update the child's interface */
+	matrix_child->id = matrix_father->id;
 	matrix_child->nx = child_nx;
 	matrix_child->nx = child_nx;
 	matrix_child->ny = ny;
 	matrix_child->ny = ny;
 	matrix_child->elemsize = elemsize;
 	matrix_child->elemsize = elemsize;
@@ -115,6 +116,7 @@ void starpu_matrix_filter_vertical_block(void *father_interface, void *child_int
 						     matrix_father->ld,
 						     matrix_father->ld,
 						     &child_ny, &offset);
 						     &child_ny, &offset);
 
 
+	matrix_child->id = matrix_father->id;
 	matrix_child->nx = nx;
 	matrix_child->nx = nx;
 	matrix_child->ny = child_ny;
 	matrix_child->ny = child_ny;
 	matrix_child->elemsize = elemsize;
 	matrix_child->elemsize = elemsize;

+ 172 - 0
src/datawizard/interfaces/matrix_interface.c

@@ -24,6 +24,8 @@
 #include <starpu_cuda.h>
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
+#include <drivers/scc/driver_scc_source.h>
+#include <drivers/mic/driver_mic_source.h>
 
 
 /* If you can promise that there is no stride in your matrices, you can define this */
 /* If you can promise that there is no stride in your matrices, you can define this */
 // #define NO_STRIDE
 // #define NO_STRIDE
@@ -47,6 +49,17 @@ static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARP
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
 static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
 static int copy_opencl_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED, cl_event *event);
 #endif
 #endif
+#ifdef STARPU_USE_SCC
+static int copy_scc_src_to_sink(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_scc_sink_to_src(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_scc_sink_to_sink(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+#endif
+#ifdef STARPU_USE_MIC
+static int copy_ram_to_mic(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_mic_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_ram_to_mic_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_mic_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+#endif
 
 
 static const struct starpu_data_copy_methods matrix_copy_data_methods_s =
 static const struct starpu_data_copy_methods matrix_copy_data_methods_s =
 {
 {
@@ -76,6 +89,17 @@ static const struct starpu_data_copy_methods matrix_copy_data_methods_s =
 	.opencl_to_ram_async = copy_opencl_to_ram_async,
 	.opencl_to_ram_async = copy_opencl_to_ram_async,
 	.opencl_to_opencl_async = copy_opencl_to_opencl_async,
 	.opencl_to_opencl_async = copy_opencl_to_opencl_async,
 #endif
 #endif
+#ifdef STARPU_USE_SCC
+	.scc_src_to_sink = copy_scc_src_to_sink,
+	.scc_sink_to_src = copy_scc_sink_to_src,
+	.scc_sink_to_sink = copy_scc_sink_to_sink,
+#endif
+#ifdef STARPU_USE_MIC
+	.ram_to_mic = copy_ram_to_mic,
+	.mic_to_ram = copy_mic_to_ram,
+	.ram_to_mic_async = copy_ram_to_mic_async,
+	.mic_to_ram_async = copy_mic_to_ram_async,
+#endif
 };
 };
 
 
 static void register_matrix_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
 static void register_matrix_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
@@ -127,6 +151,7 @@ static void register_matrix_handle(starpu_data_handle_t handle, unsigned home_no
 			local_interface->ld  = 0;
 			local_interface->ld  = 0;
 		}
 		}
 
 
+		local_interface->id = matrix_interface->id;
 		local_interface->nx = matrix_interface->nx;
 		local_interface->nx = matrix_interface->nx;
 		local_interface->ny = matrix_interface->ny;
 		local_interface->ny = matrix_interface->ny;
 		local_interface->elemsize = matrix_interface->elemsize;
 		local_interface->elemsize = matrix_interface->elemsize;
@@ -151,6 +176,7 @@ void starpu_matrix_data_register(starpu_data_handle_t *handleptr, unsigned home_
 {
 {
 	struct starpu_matrix_interface matrix_interface =
 	struct starpu_matrix_interface matrix_interface =
 	{
 	{
+		.id = STARPU_MATRIX_INTERFACE_ID,
 		.ptr = ptr,
 		.ptr = ptr,
 		.ld = ld,
 		.ld = ld,
 		.nx = nx,
 		.nx = nx,
@@ -160,6 +186,11 @@ void starpu_matrix_data_register(starpu_data_handle_t *handleptr, unsigned home_
                 .offset = 0
                 .offset = 0
 	};
 	};
 
 
+#ifdef STARPU_USE_SCC
+	_starpu_scc_set_offset_in_shared_memory((void*)matrix_interface.ptr,
+			(void**)&(matrix_interface.dev_handle), &(matrix_interface.offset));
+#endif
+
 	starpu_data_register(handleptr, home_node, &matrix_interface, &starpu_interface_matrix_ops);
 	starpu_data_register(handleptr, home_node, &matrix_interface, &starpu_interface_matrix_ops);
 }
 }
 
 
@@ -558,6 +589,147 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_A
 
 
 #endif
 #endif
 
 
+#ifdef STARPU_USE_SCC
+/* Copy a matrix from an SCC source node to a sink node.
+ *
+ * The geometry (nx, ny, elemsize) is read from the destination interface;
+ * each side uses its own leading dimension.  The data is transferred as ny
+ * chunks of nx*elemsize bytes through _starpu_scc_copy_src_to_sink().
+ *
+ * Returns 0 unconditionally: the status of the individual chunk copies is
+ * not checked.
+ */
+static int copy_scc_src_to_sink(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	uint32_t nx = STARPU_MATRIX_GET_NX(dst_interface);
+	uint32_t ny = STARPU_MATRIX_GET_NY(dst_interface);
+
+	size_t elemsize = STARPU_MATRIX_GET_ELEMSIZE(dst_interface);
+
+	uint32_t src_ld = STARPU_MATRIX_GET_LD(src_interface);
+	uint32_t dst_ld = STARPU_MATRIX_GET_LD(dst_interface);
+
+	/* Byte pointers: arithmetic on void * is only a GNU extension. */
+	char *src_ptr = (char *)STARPU_MATRIX_GET_PTR(src_interface);
+	char *dst_ptr = (char *)STARPU_MATRIX_GET_PTR(dst_interface);
+
+	unsigned y;
+	for (y = 0; y < ny; ++y)
+	{
+		/* Widen to size_t before multiplying so that the byte offset
+		 * is neither computed nor stored in 32 bits. */
+		size_t src_offset = (size_t)y*src_ld*elemsize;
+		size_t dst_offset = (size_t)y*dst_ld*elemsize;
+
+		_starpu_scc_copy_src_to_sink(src_ptr + src_offset, src_node,
+						dst_ptr + dst_offset, dst_node, nx*elemsize);
+	}
+
+	/* Same widening for the traced size, as done by the MIC copy of this interface. */
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)nx*ny*elemsize);
+
+	return 0;
+}
+
+/* Copy a matrix from an SCC sink node back to the source node.
+ *
+ * The geometry (nx, ny, elemsize) is read from the destination interface;
+ * each side uses its own leading dimension.  The data is transferred as ny
+ * chunks of nx*elemsize bytes through _starpu_scc_copy_sink_to_src().
+ *
+ * Returns 0 unconditionally: the status of the individual chunk copies is
+ * not checked.
+ */
+static int copy_scc_sink_to_src(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	uint32_t nx = STARPU_MATRIX_GET_NX(dst_interface);
+	uint32_t ny = STARPU_MATRIX_GET_NY(dst_interface);
+
+	size_t elemsize = STARPU_MATRIX_GET_ELEMSIZE(dst_interface);
+
+	uint32_t src_ld = STARPU_MATRIX_GET_LD(src_interface);
+	uint32_t dst_ld = STARPU_MATRIX_GET_LD(dst_interface);
+
+	/* Byte pointers: arithmetic on void * is only a GNU extension. */
+	char *src_ptr = (char *)STARPU_MATRIX_GET_PTR(src_interface);
+	char *dst_ptr = (char *)STARPU_MATRIX_GET_PTR(dst_interface);
+
+	unsigned y;
+	for (y = 0; y < ny; ++y)
+	{
+		/* Widen to size_t before multiplying so that the byte offset
+		 * is neither computed nor stored in 32 bits. */
+		size_t src_offset = (size_t)y*src_ld*elemsize;
+		size_t dst_offset = (size_t)y*dst_ld*elemsize;
+
+		_starpu_scc_copy_sink_to_src(src_ptr + src_offset, src_node,
+						dst_ptr + dst_offset, dst_node, nx*elemsize);
+	}
+
+	/* Same widening for the traced size, as done by the MIC copy of this interface. */
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)nx*ny*elemsize);
+
+	return 0;
+}
+
+/* Copy a matrix between two SCC sink nodes.
+ *
+ * The geometry (nx, ny, elemsize) is read from the destination interface;
+ * each side uses its own leading dimension.  The data is transferred as ny
+ * chunks of nx*elemsize bytes through _starpu_scc_copy_sink_to_sink().
+ *
+ * Returns 0 unconditionally: the status of the individual chunk copies is
+ * not checked.
+ */
+static int copy_scc_sink_to_sink(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	uint32_t nx = STARPU_MATRIX_GET_NX(dst_interface);
+	uint32_t ny = STARPU_MATRIX_GET_NY(dst_interface);
+
+	size_t elemsize = STARPU_MATRIX_GET_ELEMSIZE(dst_interface);
+
+	uint32_t src_ld = STARPU_MATRIX_GET_LD(src_interface);
+	uint32_t dst_ld = STARPU_MATRIX_GET_LD(dst_interface);
+
+	/* Byte pointers: arithmetic on void * is only a GNU extension. */
+	char *src_ptr = (char *)STARPU_MATRIX_GET_PTR(src_interface);
+	char *dst_ptr = (char *)STARPU_MATRIX_GET_PTR(dst_interface);
+
+	unsigned y;
+	for (y = 0; y < ny; ++y)
+	{
+		/* Widen to size_t before multiplying so that the byte offset
+		 * is neither computed nor stored in 32 bits. */
+		size_t src_offset = (size_t)y*src_ld*elemsize;
+		size_t dst_offset = (size_t)y*dst_ld*elemsize;
+
+		_starpu_scc_copy_sink_to_sink(src_ptr + src_offset, src_node,
+						dst_ptr + dst_offset, dst_node, nx*elemsize);
+	}
+
+	/* Same widening for the traced size, as done by the MIC copy of this interface. */
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)nx*ny*elemsize);
+
+	return 0;
+}
+#endif /* STARPU_USE_SCC */
+
+#ifdef STARPU_USE_MIC
+/* Common body of the RAM <-> MIC copies of a matrix.
+ *
+ * The geometry (nx, ny, elemsize) is read from the destination interface;
+ * each side uses its own leading dimension.  The data is transferred as ny
+ * chunks of nx*elemsize bytes.
+ *
+ * copy_func: transfer primitive called once per chunk with
+ *            (src, src_node, dst, dst_node, size); its return value is not
+ *            checked here.
+ *
+ * Always returns 0.
+ */
+static int copy_mic_common(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node,
+						   int (*copy_func)(void *, unsigned, void *, unsigned, size_t))
+{
+	struct starpu_matrix_interface *src_matrix = src_interface;
+	struct starpu_matrix_interface *dst_matrix = dst_interface;
+
+	unsigned y;
+	uint32_t nx = dst_matrix->nx;
+	uint32_t ny = dst_matrix->ny;
+	size_t elemsize = dst_matrix->elemsize;
+
+	uint32_t ld_src = src_matrix->ld;
+	uint32_t ld_dst = dst_matrix->ld;
+
+	uintptr_t ptr_src = src_matrix->ptr;
+	uintptr_t ptr_dst = dst_matrix->ptr;
+
+
+	for (y = 0; y < ny; y++)
+	{
+		/* Widen to size_t before multiplying so that the byte offset
+		 * is neither computed nor stored in 32 bits. */
+		size_t src_offset = (size_t)y*ld_src*elemsize;
+		size_t dst_offset = (size_t)y*ld_dst*elemsize;
+
+		copy_func((void *)(ptr_src + src_offset), src_node, (void *)(ptr_dst + dst_offset), dst_node, nx*elemsize);
+	}
+
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, (size_t)nx*ny*elemsize);
+
+	return 0;
+}
+
+/* Synchronous RAM -> MIC copy of a matrix: runs copy_mic_common() with
+ * _starpu_mic_copy_ram_to_mic as the per-chunk primitive and forwards its
+ * return value. */
+static int copy_ram_to_mic(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	int status = copy_mic_common(src_interface, src_node, dst_interface, dst_node,
+				     _starpu_mic_copy_ram_to_mic);
+
+	return status;
+}
+
+/* Synchronous MIC -> RAM copy of a matrix: same as above, using
+ * _starpu_mic_copy_mic_to_ram as the per-chunk primitive. */
+static int copy_mic_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	int status = copy_mic_common(src_interface, src_node, dst_interface, dst_node,
+				     _starpu_mic_copy_mic_to_ram);
+
+	return status;
+}
+
+/* Asynchronous RAM -> MIC copy of a matrix.  The value returned by
+ * copy_mic_common() is discarded and -EAGAIN is reported unconditionally
+ * (presumably to tell the caller the transfer is still pending -- TODO confirm). */
+static int copy_ram_to_mic_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	(void) copy_mic_common(src_interface, src_node, dst_interface, dst_node,
+			       _starpu_mic_copy_ram_to_mic_async);
+
+	return -EAGAIN;
+}
+
+/* Asynchronous MIC -> RAM copy of a matrix.  The value returned by
+ * copy_mic_common() is discarded and -EAGAIN is reported unconditionally
+ * (presumably to tell the caller the transfer is still pending -- TODO confirm). */
+static int copy_mic_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	(void) copy_mic_common(src_interface, src_node, dst_interface, dst_node,
+			       _starpu_mic_copy_mic_to_ram_async);
+
+	return -EAGAIN;
+}
+#endif
+
 /* as not all platform easily have a BLAS lib installed ... */
 /* as not all platform easily have a BLAS lib installed ... */
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node STARPU_ATTRIBUTE_UNUSED)
 {
 {

+ 118 - 5
src/datawizard/interfaces/multiformat_interface.c

@@ -23,6 +23,7 @@
 #include <starpu_cuda.h>
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
+#include <drivers/mic/driver_mic_source.h>
 #include <core/task.h>
 #include <core/task.h>
 
 
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
@@ -41,6 +42,12 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node STARPU_A
 static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cl_event *event);
 static int copy_ram_to_opencl_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cl_event *event);
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cl_event *event);
 static int copy_opencl_to_ram_async(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node, cl_event *event);
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+static int copy_ram_to_mic(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_mic_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_ram_to_mic_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+static int copy_mic_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node);
+#endif
 
 
 static const struct starpu_data_copy_methods multiformat_copy_data_methods_s =
 static const struct starpu_data_copy_methods multiformat_copy_data_methods_s =
 {
 {
@@ -65,6 +72,12 @@ static const struct starpu_data_copy_methods multiformat_copy_data_methods_s =
         .ram_to_opencl_async = copy_ram_to_opencl_async,
         .ram_to_opencl_async = copy_ram_to_opencl_async,
 	.opencl_to_ram_async = copy_opencl_to_ram_async,
 	.opencl_to_ram_async = copy_opencl_to_ram_async,
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+	.ram_to_mic = copy_ram_to_mic,
+	.mic_to_ram = copy_mic_to_ram,
+	.ram_to_mic_async = copy_ram_to_mic_async,
+	.mic_to_ram_async = copy_mic_to_ram_async,
+#endif
 };
 };
 
 
 static void register_multiformat_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
 static void register_multiformat_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface);
@@ -86,7 +99,7 @@ get_mf_ops(void *data_interface)
 	return mf->ops;
 	return mf->ops;
 }
 }
 
 
-static struct starpu_data_interface_ops interface_multiformat_ops =
+struct starpu_data_interface_ops starpu_interface_multiformat_ops =
 {
 {
 	.register_data_handle  = register_multiformat_handle,
 	.register_data_handle  = register_multiformat_handle,
 	.allocate_data_on_node = allocate_multiformat_buffer_on_node,
 	.allocate_data_on_node = allocate_multiformat_buffer_on_node,
@@ -121,6 +134,10 @@ static void *multiformat_handle_to_pointer(starpu_data_handle_t handle, unsigned
 		case STARPU_OPENCL_RAM:
 		case STARPU_OPENCL_RAM:
 			return multiformat_interface->opencl_ptr;
 			return multiformat_interface->opencl_ptr;
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+		case STARPU_MIC_RAM:
+			return multiformat_interface->mic_ptr;
+#endif
 		default:
 		default:
 			STARPU_ABORT();
 			STARPU_ABORT();
 	}
 	}
@@ -147,6 +164,9 @@ static void register_multiformat_handle(starpu_data_handle_t handle, unsigned ho
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 			local_interface->opencl_ptr = multiformat_interface->opencl_ptr;
 			local_interface->opencl_ptr = multiformat_interface->opencl_ptr;
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+			local_interface->mic_ptr    = multiformat_interface->mic_ptr;
+#endif
 		}
 		}
 		else
 		else
 		{
 		{
@@ -157,7 +177,11 @@ static void register_multiformat_handle(starpu_data_handle_t handle, unsigned ho
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 			local_interface->opencl_ptr = NULL;
 			local_interface->opencl_ptr = NULL;
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+			local_interface->mic_ptr    = NULL;
+#endif
 		}
 		}
+		local_interface->id = multiformat_interface->id;
 		local_interface->nx = multiformat_interface->nx;
 		local_interface->nx = multiformat_interface->nx;
 		local_interface->ops = multiformat_interface->ops;
 		local_interface->ops = multiformat_interface->ops;
 	}
 	}
@@ -173,17 +197,21 @@ void starpu_multiformat_data_register(starpu_data_handle_t *handleptr,
 	_starpu_codelet_check_deprecated_fields(format_ops->opencl_to_cpu_cl);
 	_starpu_codelet_check_deprecated_fields(format_ops->opencl_to_cpu_cl);
 	_starpu_codelet_check_deprecated_fields(format_ops->cpu_to_cuda_cl);
 	_starpu_codelet_check_deprecated_fields(format_ops->cpu_to_cuda_cl);
 	_starpu_codelet_check_deprecated_fields(format_ops->cuda_to_cpu_cl);
 	_starpu_codelet_check_deprecated_fields(format_ops->cuda_to_cpu_cl);
+	_starpu_codelet_check_deprecated_fields(format_ops->cpu_to_mic_cl);
+	_starpu_codelet_check_deprecated_fields(format_ops->mic_to_cpu_cl);
 
 
 	struct starpu_multiformat_interface multiformat =
 	struct starpu_multiformat_interface multiformat =
 	{
 	{
+		.id         = STARPU_MULTIFORMAT_INTERFACE_ID,
 		.cpu_ptr    = ptr,
 		.cpu_ptr    = ptr,
 		.cuda_ptr   = NULL,
 		.cuda_ptr   = NULL,
 		.opencl_ptr = NULL,
 		.opencl_ptr = NULL,
+		.mic_ptr    = NULL,
 		.nx         = nobjects,
 		.nx         = nobjects,
 		.ops        = format_ops
 		.ops        = format_ops
 	};
 	};
 
 
-	starpu_data_register(handleptr, home_node, &multiformat, &interface_multiformat_ops);
+	starpu_data_register(handleptr, home_node, &multiformat, &starpu_interface_multiformat_ops);
 }
 }
 
 
 static uint32_t footprint_multiformat_interface_crc32(starpu_data_handle_t handle)
 static uint32_t footprint_multiformat_interface_crc32(starpu_data_handle_t handle)
@@ -204,6 +232,9 @@ static int multiformat_compare(void *data_interface_a, void *data_interface_b)
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 			&& (multiformat_a->ops->opencl_elemsize == multiformat_b->ops->opencl_elemsize)
 			&& (multiformat_a->ops->opencl_elemsize == multiformat_b->ops->opencl_elemsize)
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+		    && (multiformat_a->ops->mic_elemsize == multiformat_b->ops->mic_elemsize)
+#endif
 		);
 		);
 }
 }
 
 
@@ -263,16 +294,26 @@ static starpu_ssize_t allocate_multiformat_buffer_on_node(void *data_interface_,
 		goto fail_opencl;
 		goto fail_opencl;
 	multiformat_interface->opencl_ptr = (void *) addr;
 	multiformat_interface->opencl_ptr = (void *) addr;
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+	size = multiformat_interface->nx * multiformat_interface->ops->mic_elemsize;
+	allocated_memory += size;
+	addr = starpu_malloc_on_node(dst_node, size);
+	if (!addr)
+		goto fail_mic;
+	multiformat_interface->mic_ptr = (void *) addr;
+#endif
 
 
 	return allocated_memory;
 	return allocated_memory;
 
 
+#ifdef STARPU_USE_MIC
+fail_mic:
+#endif
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
+	starpu_free_on_node(dst_node, (uintptr_t) multiformat_interface->opencl_ptr, multiformat_interface->nx * multiformat_interface->ops->opencl_elemsize);
 fail_opencl:
 fail_opencl:
-#ifdef STARPU_USE_CUDA
-	starpu_free_on_node(dst_node, (uintptr_t) multiformat_interface->cuda_ptr, multiformat_interface->nx * multiformat_interface->ops->cuda_elemsize);
-#endif
 #endif
 #endif
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA
+	starpu_free_on_node(dst_node, (uintptr_t) multiformat_interface->cuda_ptr, multiformat_interface->nx * multiformat_interface->ops->cuda_elemsize);
 fail_cuda:
 fail_cuda:
 #endif
 #endif
 	starpu_free_on_node(dst_node, (uintptr_t) multiformat_interface->cpu_ptr, multiformat_interface->nx * multiformat_interface->ops->cpu_elemsize);
 	starpu_free_on_node(dst_node, (uintptr_t) multiformat_interface->cpu_ptr, multiformat_interface->nx * multiformat_interface->ops->cpu_elemsize);
@@ -298,6 +339,11 @@ static void free_multiformat_buffer_on_node(void *data_interface, unsigned node)
 				   multiformat_interface->nx * multiformat_interface->ops->opencl_elemsize);
 				   multiformat_interface->nx * multiformat_interface->ops->opencl_elemsize);
 	multiformat_interface->opencl_ptr = NULL;
 	multiformat_interface->opencl_ptr = NULL;
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+	starpu_free_on_node(node, (uintptr_t) multiformat_interface->mic_ptr,
+				   multiformat_interface->nx * multiformat_interface->ops->mic_elemsize);
+	multiformat_interface->mic_ptr = NULL;
+#endif
 }
 }
 
 
 
 
@@ -645,3 +691,70 @@ static int copy_opencl_to_opencl(void *src_interface, unsigned src_node,
 	return 0;
 	return 0;
 }
 }
 #endif
 #endif
+
+#ifdef STARPU_USE_MIC
+/* Common body of the RAM -> MIC multiformat transfers.
+ * copy_func performs the actual transfer; it is called with
+ * (source pointer, source node, destination pointer, destination node, size in bytes).
+ * Returns 0, or -ENOMEM when the host-side MIC-format buffer cannot be allocated.
+ * NOTE(review): the value returned by copy_func is ignored -- confirm this is intended. */
+static int copy_mic_common_ram_to_mic(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node,
+						   int (*copy_func)(void *, unsigned, void *, unsigned, size_t))
+{
+	struct starpu_multiformat_interface *src_multiformat = src_interface;
+	struct starpu_multiformat_interface *dst_multiformat = dst_interface;
+
+	STARPU_ASSERT(src_multiformat != NULL);
+	STARPU_ASSERT(dst_multiformat != NULL);
+	STARPU_ASSERT(dst_multiformat->ops != NULL);
+
+	/* The transfer size is counted in MIC-format elements */
+	size_t size = dst_multiformat->nx * dst_multiformat->ops->mic_elemsize;
+	/* Lazily allocate the MIC-format buffer of the source interface with malloc() */
+	if (src_multiformat->mic_ptr == NULL)
+	{
+		src_multiformat->mic_ptr = malloc(size);
+		if (src_multiformat->mic_ptr == NULL)
+			return -ENOMEM;
+	}
+	
+	/* NOTE(review): the buffer ensured above is src->mic_ptr, yet the copy reads
+	 * src->cpu_ptr with a size based on mic_elemsize -- confirm whether the
+	 * source of this copy should be src_multiformat->mic_ptr. */
+	copy_func(src_multiformat->cpu_ptr, src_node, dst_multiformat->cpu_ptr, dst_node, size);
+
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, size);
+
+	return 0;
+}
+
+/* Common body of the MIC -> RAM multiformat transfers.
+ * copy_func performs the actual transfer; it is called with
+ * (source pointer, source node, destination pointer, destination node, size in bytes).
+ * The MIC-format buffer of the source is copied into the MIC-format buffer
+ * of the destination. Always returns 0. */
+static int copy_mic_common_mic_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node,
+						   int (*copy_func)(void *, unsigned, void *, unsigned, size_t))
+{
+	struct starpu_multiformat_interface *src_multiformat = src_interface;
+	struct starpu_multiformat_interface *dst_multiformat = dst_interface;
+
+	STARPU_ASSERT(src_multiformat != NULL);
+	STARPU_ASSERT(dst_multiformat != NULL);
+	/* The size below is computed from the source ops, so check them as well */
+	STARPU_ASSERT(src_multiformat->ops != NULL);
+	STARPU_ASSERT(dst_multiformat->ops != NULL);
+
+	size_t size = src_multiformat->nx * src_multiformat->ops->mic_elemsize;
+	copy_func(src_multiformat->mic_ptr, src_node, dst_multiformat->mic_ptr, dst_node, size);
+
+	_STARPU_TRACE_DATA_COPY(src_node, dst_node, size);
+
+	return 0;
+}
+
+/* Synchronous RAM -> MIC copy: the common helper driven by _starpu_mic_copy_ram_to_mic. */
+static int copy_ram_to_mic(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	const int status = copy_mic_common_ram_to_mic(src_interface, src_node,
+						      dst_interface, dst_node,
+						      _starpu_mic_copy_ram_to_mic);
+
+	return status;
+}
+
+/* Synchronous MIC -> RAM copy: the common helper driven by _starpu_mic_copy_mic_to_ram. */
+static int copy_mic_to_ram(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	const int status = copy_mic_common_mic_to_ram(src_interface, src_node,
+						      dst_interface, dst_node,
+						      _starpu_mic_copy_mic_to_ram);
+
+	return status;
+}
+
+/* Asynchronous RAM -> MIC copy: the common helper driven by
+ * _starpu_mic_copy_ram_to_mic_async.
+ * Returns -EAGAIN once the helper has run successfully, or the helper's
+ * error code (e.g. -ENOMEM) when it failed before issuing the copy. */
+static int copy_ram_to_mic_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	int ret = copy_mic_common_ram_to_mic(src_interface, src_node, dst_interface, dst_node, _starpu_mic_copy_ram_to_mic_async);
+
+	/* Do not announce a pending transfer when nothing was submitted */
+	if (ret != 0)
+		return ret;
+
+	return -EAGAIN;
+}
+
+/* Asynchronous MIC -> RAM copy: the common helper driven by
+ * _starpu_mic_copy_mic_to_ram_async. Always returns -EAGAIN. */
+static int copy_mic_to_ram_async(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node)
+{
+	/* The helper's return value is deliberately not looked at */
+	(void) copy_mic_common_mic_to_ram(src_interface, src_node,
+					  dst_interface, dst_node,
+					  _starpu_mic_copy_mic_to_ram_async);
+
+	return -EAGAIN;
+}
+#endif

+ 20 - 4
src/datawizard/interfaces/variable_interface.c

@@ -24,6 +24,8 @@
 #include <starpu_cuda.h>
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
+#include <drivers/scc/driver_scc_source.h>
+#include <drivers/mic/driver_mic_source.h>
 
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 
@@ -41,7 +43,7 @@ static uint32_t footprint_variable_interface_crc32(starpu_data_handle_t handle);
 static int variable_compare(void *data_interface_a, void *data_interface_b);
 static int variable_compare(void *data_interface_a, void *data_interface_b);
 static void display_variable_interface(starpu_data_handle_t handle, FILE *f);
 static void display_variable_interface(starpu_data_handle_t handle, FILE *f);
 
 
-static struct starpu_data_interface_ops interface_variable_ops =
+struct starpu_data_interface_ops starpu_interface_variable_ops =
 {
 {
 	.register_data_handle = register_variable_handle,
 	.register_data_handle = register_variable_handle,
 	.allocate_data_on_node = allocate_variable_buffer_on_node,
 	.allocate_data_on_node = allocate_variable_buffer_on_node,
@@ -65,6 +67,7 @@ static void *variable_handle_to_pointer(starpu_data_handle_t handle, unsigned no
 
 
 static void register_variable_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
 static void register_variable_handle(starpu_data_handle_t handle, unsigned home_node, void *data_interface)
 {
 {
+	struct starpu_variable_interface *variable_interface = (struct starpu_variable_interface *)data_interface;
 	unsigned node;
 	unsigned node;
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	for (node = 0; node < STARPU_MAXNODES; node++)
 	{
 	{
@@ -73,14 +76,19 @@ static void register_variable_handle(starpu_data_handle_t handle, unsigned home_
 
 
 		if (node == home_node)
 		if (node == home_node)
 		{
 		{
-			local_interface->ptr = STARPU_VARIABLE_GET_PTR(data_interface);
+			local_interface->ptr = variable_interface->ptr;
+			local_interface->dev_handle = variable_interface->dev_handle;
+			local_interface->offset = variable_interface->offset;
 		}
 		}
 		else
 		else
 		{
 		{
 			local_interface->ptr = 0;
 			local_interface->ptr = 0;
+			local_interface->dev_handle = 0;
+			local_interface->offset = 0;
 		}
 		}
 
 
-		local_interface->elemsize = STARPU_VARIABLE_GET_ELEMSIZE(data_interface);
+		local_interface->id = variable_interface->id;
+		local_interface->elemsize = variable_interface->elemsize;
 	}
 	}
 }
 }
 
 
@@ -90,11 +98,19 @@ void starpu_variable_data_register(starpu_data_handle_t *handleptr, unsigned hom
 {
 {
 	struct starpu_variable_interface variable =
 	struct starpu_variable_interface variable =
 	{
 	{
+		.id = STARPU_VARIABLE_INTERFACE_ID,
 		.ptr = ptr,
 		.ptr = ptr,
+		.dev_handle = ptr,
+		.offset = 0,
 		.elemsize = elemsize
 		.elemsize = elemsize
 	};
 	};
 
 
-	starpu_data_register(handleptr, home_node, &variable, &interface_variable_ops);
+#ifdef STARPU_USE_SCC
+	_starpu_scc_set_offset_in_shared_memory((void*)variable.ptr, (void**)&(variable.dev_handle),
+			&(variable.offset));
+#endif
+
+	starpu_data_register(handleptr, home_node, &variable, &starpu_interface_variable_ops);
 }
 }
 
 
 
 

+ 4 - 0
src/datawizard/interfaces/vector_filters.c

@@ -35,6 +35,7 @@ void starpu_vector_filter_block(void *father_interface, void *child_interface, S
 	_starpu_filter_nparts_compute_chunk_size_and_offset(nx, nchunks, elemsize, id, 1,
 	_starpu_filter_nparts_compute_chunk_size_and_offset(nx, nchunks, elemsize, id, 1,
 						     &child_nx, &offset);
 						     &child_nx, &offset);
 
 
+	vector_child->id = vector_father->id;
 	vector_child->nx = child_nx;
 	vector_child->nx = child_nx;
 	vector_child->elemsize = elemsize;
 	vector_child->elemsize = elemsize;
 
 
@@ -95,6 +96,8 @@ void starpu_vector_filter_divide_in_2(void *father_interface, void *child_interf
 
 
 	STARPU_ASSERT_MSG(length_first < nx, "First part is too long: %u vs %u", length_first, nx);
 	STARPU_ASSERT_MSG(length_first < nx, "First part is too long: %u vs %u", length_first, nx);
 
 
+	vector_child->id = vector_father->id;
+
 	/* this is the first child */
 	/* this is the first child */
 	if (id == 0)
 	if (id == 0)
 	{
 	{
@@ -138,6 +141,7 @@ void starpu_vector_filter_list(void *father_interface, void *child_interface, st
 
 
 	uint32_t chunk_size = length_tab[id];
 	uint32_t chunk_size = length_tab[id];
 
 
+	vector_child->id = vector_father->id;
 	vector_child->nx = chunk_size;
 	vector_child->nx = chunk_size;
 	vector_child->elemsize = elemsize;
 	vector_child->elemsize = elemsize;
 
 

+ 10 - 2
src/datawizard/interfaces/vector_interface.c

@@ -24,6 +24,8 @@
 #include <starpu_cuda.h>
 #include <starpu_cuda.h>
 #include <starpu_opencl.h>
 #include <starpu_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
 #include <drivers/opencl/driver_opencl.h>
+#include <drivers/mic/driver_mic_source.h>
+#include <drivers/scc/driver_scc_source.h>
 
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 
@@ -41,7 +43,7 @@ static uint32_t footprint_vector_interface_crc32(starpu_data_handle_t handle);
 static int vector_compare(void *data_interface_a, void *data_interface_b);
 static int vector_compare(void *data_interface_a, void *data_interface_b);
 static void display_vector_interface(starpu_data_handle_t handle, FILE *f);
 static void display_vector_interface(starpu_data_handle_t handle, FILE *f);
 
 
-static struct starpu_data_interface_ops interface_vector_ops =
+struct starpu_data_interface_ops starpu_interface_vector_ops =
 {
 {
 	.register_data_handle = register_vector_handle,
 	.register_data_handle = register_vector_handle,
 	.allocate_data_on_node = allocate_vector_buffer_on_node,
 	.allocate_data_on_node = allocate_vector_buffer_on_node,
@@ -89,6 +91,7 @@ static void register_vector_handle(starpu_data_handle_t handle, unsigned home_no
                         local_interface->offset = 0;
                         local_interface->offset = 0;
 		}
 		}
 
 
+		local_interface->id = vector_interface->id;
 		local_interface->nx = vector_interface->nx;
 		local_interface->nx = vector_interface->nx;
 		local_interface->elemsize = vector_interface->elemsize;
 		local_interface->elemsize = vector_interface->elemsize;
 	}
 	}
@@ -100,6 +103,7 @@ void starpu_vector_data_register(starpu_data_handle_t *handleptr, unsigned home_
 {
 {
 	struct starpu_vector_interface vector =
 	struct starpu_vector_interface vector =
 	{
 	{
+		.id = STARPU_VECTOR_INTERFACE_ID,
 		.ptr = ptr,
 		.ptr = ptr,
 		.nx = nx,
 		.nx = nx,
 		.elemsize = elemsize,
 		.elemsize = elemsize,
@@ -107,7 +111,11 @@ void starpu_vector_data_register(starpu_data_handle_t *handleptr, unsigned home_
                 .offset = 0
                 .offset = 0
 	};
 	};
 
 
-	starpu_data_register(handleptr, home_node, &vector, &interface_vector_ops);
+#ifdef STARPU_USE_SCC
+	_starpu_scc_set_offset_in_shared_memory((void*)vector.ptr, (void**)&(vector.dev_handle), &(vector.offset));
+#endif
+
+	starpu_data_register(handleptr, home_node, &vector, &starpu_interface_vector_ops);
 }
 }
 
 
 
 

+ 2 - 2
src/datawizard/interfaces/void_interface.c

@@ -40,7 +40,7 @@ static uint32_t footprint_void_interface_crc32(starpu_data_handle_t handle);
 static int void_compare(void *data_interface_a, void *data_interface_b);
 static int void_compare(void *data_interface_a, void *data_interface_b);
 static void display_void_interface(starpu_data_handle_t handle, FILE *f);
 static void display_void_interface(starpu_data_handle_t handle, FILE *f);
 
 
-static struct starpu_data_interface_ops interface_void_ops =
+struct starpu_data_interface_ops starpu_interface_void_ops =
 {
 {
 	.register_data_handle = register_void_handle,
 	.register_data_handle = register_void_handle,
 	.allocate_data_on_node = allocate_void_buffer_on_node,
 	.allocate_data_on_node = allocate_void_buffer_on_node,
@@ -64,7 +64,7 @@ static void register_void_handle(starpu_data_handle_t handle STARPU_ATTRIBUTE_UN
 /* declare a new data with the void interface */
 /* declare a new data with the void interface */
 void starpu_void_data_register(starpu_data_handle_t *handleptr)
 void starpu_void_data_register(starpu_data_handle_t *handleptr)
 {
 {
-	starpu_data_register(handleptr, 0, NULL, &interface_void_ops);
+	starpu_data_register(handleptr, 0, NULL, &starpu_interface_void_ops);
 }
 }
 
 
 
 

+ 35 - 0
src/datawizard/malloc.c

@@ -177,6 +177,13 @@ int starpu_malloc_flags(void **A, size_t dim, int flags)
 	}
 	}
 #endif /* STARPU_SIMGRID */
 #endif /* STARPU_SIMGRID */
 
 
+	if (_starpu_can_submit_scc_task())
+	{
+#ifdef STARPU_USE_SCC
+		_starpu_scc_allocate_shared_memory(A, dim);
+#endif
+	}
+	else
 #ifdef STARPU_HAVE_POSIX_MEMALIGN
 #ifdef STARPU_HAVE_POSIX_MEMALIGN
 	if (_malloc_align != sizeof(void*))
 	if (_malloc_align != sizeof(void*))
 	{
 	{
@@ -318,6 +325,12 @@ int starpu_free_flags(void *A, size_t dim, int flags)
 	}
 	}
 #endif /* STARPU_SIMGRID */
 #endif /* STARPU_SIMGRID */
 
 
+	if (_starpu_can_submit_scc_task())
+	{
+#ifdef STARPU_USE_SCC
+		_starpu_scc_free_shared_memory(A);
+#endif
+	} else
 	free(A);
 	free(A);
 
 
 out:
 out:
@@ -406,6 +419,18 @@ starpu_malloc_on_node(unsigned dst_node, size_t size)
 #endif
 #endif
 			}
 			}
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+		case STARPU_MIC_RAM:
+			if (_starpu_mic_allocate_memory((void **)(&addr), size, dst_node))
+				addr = 0;
+			break;
+#endif
+#ifdef STARPU_USE_SCC
+		case STARPU_SCC_RAM:
+			if (_starpu_scc_allocate_memory((void **)(&addr), size, dst_node))
+				addr = 0;
+			break;
+#endif
 		default:
 		default:
 			STARPU_ABORT();
 			STARPU_ABORT();
 	}
 	}
@@ -461,6 +486,16 @@ starpu_free_on_node(unsigned dst_node, uintptr_t addr, size_t size)
                         break;
                         break;
 		}
 		}
 #endif
 #endif
+#ifdef STARPU_USE_MIC
+		case STARPU_MIC_RAM:
+			_starpu_mic_free_memory((void*) addr, size, dst_node);
+			break;
+#endif
+#ifdef STARPU_USE_SCC
+		case STARPU_SCC_RAM:
+			_starpu_scc_free_memory((void *) addr, dst_node);
+			break;
+#endif
 		default:
 		default:
 			STARPU_ABORT();
 			STARPU_ABORT();
 	}
 	}

+ 22 - 32
src/datawizard/memalloc.c

@@ -73,44 +73,34 @@ void _starpu_deinit_mem_chunk_lists(void)
 
 
 static void lock_all_subtree(starpu_data_handle_t handle)
 static void lock_all_subtree(starpu_data_handle_t handle)
 {
 {
-	if (handle->nchildren == 0)
-	{
-		/* this is a leaf */
-		while (_starpu_spin_trylock(&handle->header_lock))
-			_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
-	}
-	else
+	unsigned child;
+
+	/* lock parent */
+	while (_starpu_spin_trylock(&handle->header_lock))
+		_starpu_datawizard_progress(_starpu_memory_node_get_local_key(), 0);
+
+	/* lock all sub-subtrees children */
+	for (child = 0; child < handle->nchildren; child++)
 	{
 	{
-		/* lock all sub-subtrees children */
-		unsigned child;
-		for (child = 0; child < handle->nchildren; child++)
-		{
-			starpu_data_handle_t child_handle = starpu_data_get_child(handle, child);
-			lock_all_subtree(child_handle);
-		}
+		starpu_data_handle_t child_handle = starpu_data_get_child(handle, child);
+		lock_all_subtree(child_handle);
 	}
 	}
 }
 }
 
 
 static void unlock_all_subtree(starpu_data_handle_t handle)
 static void unlock_all_subtree(starpu_data_handle_t handle)
 {
 {
-	if (handle->nchildren == 0)
-	{
-		/* this is a leaf */
-		_starpu_spin_unlock(&handle->header_lock);
-	}
-	else
+	/* unlock all sub-subtrees children
+	 * Note that this is done in the reverse order of the
+	 * lock_all_subtree so that we avoid deadlock */
+	unsigned i;
+	for (i =0; i < handle->nchildren; i++)
 	{
 	{
-		/* lock all sub-subtrees children
-		 * Note that this is done in the reverse order of the
-		 * lock_all_subtree so that we avoid deadlock */
-		unsigned i;
-		for (i =0; i < handle->nchildren; i++)
-		{
-			unsigned child = handle->nchildren - 1 - i;
-			starpu_data_handle_t child_handle = starpu_data_get_child(handle, child);
-			unlock_all_subtree(child_handle);
-		}
+		unsigned child = handle->nchildren - 1 - i;
+		starpu_data_handle_t child_handle = starpu_data_get_child(handle, child);
+		unlock_all_subtree(child_handle);
 	}
 	}
+
+	_starpu_spin_unlock(&handle->header_lock);
 }
 }
 
 
 static unsigned may_free_subtree(starpu_data_handle_t handle, unsigned node)
 static unsigned may_free_subtree(starpu_data_handle_t handle, unsigned node)
@@ -336,7 +326,7 @@ static size_t try_to_free_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node)
 	}
 	}
 	else
 	else
 	{
 	{
-		/* try to lock all the leafs of the subtree */
+		/* try to lock all the subtree */
 		lock_all_subtree(handle);
 		lock_all_subtree(handle);
 
 
 		/* check if they are all "free" */
 		/* check if they are all "free" */
@@ -418,7 +408,7 @@ static unsigned try_to_reuse_mem_chunk(struct _starpu_mem_chunk *mc, unsigned no
 
 
 	STARPU_ASSERT(old_data);
 	STARPU_ASSERT(old_data);
 
 
-	/* try to lock all the leafs of the subtree */
+	/* try to lock all the subtree */
 	lock_all_subtree(old_data);
 	lock_all_subtree(old_data);
 
 
 	/* check if they are all "free" */
 	/* check if they are all "free" */

+ 1 - 1
src/datawizard/memory_manager.c

@@ -59,7 +59,7 @@ int _starpu_memory_manager_can_allocate_size(size_t size, unsigned node)
 		used_size[node] += size;
 		used_size[node] += size;
 		ret = 1;
 		ret = 1;
 	}
 	}
-	else if (used_size[node] + size < global_size[node])
+	else if (used_size[node] + size <= global_size[node])
 	{
 	{
 		used_size[node] += size;
 		used_size[node] += size;
 		ret = 1;
 		ret = 1;

+ 35 - 7
src/datawizard/reduction.c

@@ -20,6 +20,8 @@
 #include <util/starpu_data_cpy.h>
 #include <util/starpu_data_cpy.h>
 #include <core/task.h>
 #include <core/task.h>
 #include <datawizard/datawizard.h>
 #include <datawizard/datawizard.h>
+#include <drivers/mic/driver_mic_source.h>
+#include <drivers/mp_common/source_common.h>
 
 
 void starpu_data_set_reduction_methods(starpu_data_handle_t handle,
 void starpu_data_set_reduction_methods(starpu_data_handle_t handle,
 				       struct starpu_codelet *redux_cl,
 				       struct starpu_codelet *redux_cl,
@@ -68,6 +70,12 @@ void _starpu_redux_init_data_replicate(starpu_data_handle_t handle, struct _star
 		case STARPU_OPENCL_WORKER:
 		case STARPU_OPENCL_WORKER:
 			init_func = _starpu_task_get_opencl_nth_implementation(init_cl, 0);
 			init_func = _starpu_task_get_opencl_nth_implementation(init_cl, 0);
 			break;
 			break;
+#ifdef STARPU_USE_MIC
+		case STARPU_MIC_WORKER:
+			init_func = _starpu_mic_src_get_kernel_from_codelet(init_cl, 0);
+			break;
+#endif
+			/* TODO: SCC */
 		default:
 		default:
 			STARPU_ABORT();
 			STARPU_ABORT();
 			break;
 			break;
@@ -75,7 +83,27 @@ void _starpu_redux_init_data_replicate(starpu_data_handle_t handle, struct _star
 
 
 	STARPU_ASSERT(init_func);
 	STARPU_ASSERT(init_func);
 
 
-	init_func(&replicate->data_interface, NULL);
+#ifdef STARPU_USE_MIC
+	if (starpu_worker_get_type(workerid) == STARPU_MIC_WORKER)
+	{
+		const struct _starpu_mp_node *node = _starpu_mic_src_get_actual_thread_mp_node();
+		enum _starpu_mp_command answer;
+		void *arg = NULL;
+		int arg_size = 0;
+
+		// XXX: give the correct coreid.
+		_starpu_src_common_execute_kernel(node,
+						  (void(*)(void))init_func, 0,
+						  &handle, &(replicate->data_interface), 1,
+						  NULL, 0);
+		answer = _starpu_mp_common_recv_command (node, &arg, &arg_size);
+		STARPU_ASSERT (answer == STARPU_EXECUTION_COMPLETED);
+	}
+	else
+#endif
+	{
+		init_func(&replicate->data_interface, NULL);
+	}
 
 
 	replicate->initialized = 1;
 	replicate->initialized = 1;
 }
 }
@@ -305,13 +333,13 @@ void _starpu_data_end_reduction_mode(starpu_data_handle_t handle)
 			redux_task->cl = handle->redux_cl;
 			redux_task->cl = handle->redux_cl;
 			STARPU_ASSERT(redux_task->cl);
 			STARPU_ASSERT(redux_task->cl);
 
 
-			if (!redux_task->cl->modes[0])
-				redux_task->cl->modes[0] = STARPU_RW;
-			if (!redux_task->cl->modes[1])
-				redux_task->cl->modes[1] = STARPU_R;
+			if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 0))
+				STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_RW, 0);
+			if (!(STARPU_CODELET_GET_MODE(redux_task->cl, 1))
+				STARPU_CODELET_SET_MODE(redux_task->cl, STARPU_R, 1);
 
 
-			STARPU_ASSERT_MSG(redux_task->cl->modes[0] == STARPU_RW, "First parameter of reduction codelet has to be RW");
-			STARPU_ASSERT_MSG(redux_task->cl->modes[1] == STARPU_R, "Second parameter of reduction codelet has to be R");
+			STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 0) == STARPU_RW, "First parameter of reduction codelet has to be RW");
+			STARPU_ASSERT_MSG(STARPU_CODELET_GET_MODE(redux_task->cl, 1) == STARPU_R, "Second parameter of reduction codelet has to be R");
 
 
 			STARPU_TASK_SET_HANDLE(redux_task, handle, 0);
 			STARPU_TASK_SET_HANDLE(redux_task, handle, 0);
 			STARPU_TASK_SET_HANDLE(redux_task, replicate_array[replicate], 1);
 			STARPU_TASK_SET_HANDLE(redux_task, replicate_array[replicate], 1);

+ 24 - 0
src/debug/traces/starpu_fxt.c

@@ -31,12 +31,16 @@
 static char *cpus_worker_colors[STARPU_NMAXWORKERS] = {"/greens9/7", "/greens9/6", "/greens9/5", "/greens9/4",  "/greens9/9", "/greens9/3",  "/greens9/2",  "/greens9/1"  };
 static char *cpus_worker_colors[STARPU_NMAXWORKERS] = {"/greens9/7", "/greens9/6", "/greens9/5", "/greens9/4",  "/greens9/9", "/greens9/3",  "/greens9/2",  "/greens9/1"  };
 static char *cuda_worker_colors[STARPU_NMAXWORKERS] = {"/ylorrd9/9", "/ylorrd9/6", "/ylorrd9/3", "/ylorrd9/1", "/ylorrd9/8", "/ylorrd9/7", "/ylorrd9/4", "/ylorrd9/2",  "/ylorrd9/1"};
 static char *cuda_worker_colors[STARPU_NMAXWORKERS] = {"/ylorrd9/9", "/ylorrd9/6", "/ylorrd9/3", "/ylorrd9/1", "/ylorrd9/8", "/ylorrd9/7", "/ylorrd9/4", "/ylorrd9/2",  "/ylorrd9/1"};
 static char *opencl_worker_colors[STARPU_NMAXWORKERS] = {"/blues9/9", "/blues9/6", "/blues9/3", "/blues9/1", "/blues9/8", "/blues9/7", "/blues9/4", "/blues9/2",  "/blues9/1"};
 static char *opencl_worker_colors[STARPU_NMAXWORKERS] = {"/blues9/9", "/blues9/6", "/blues9/3", "/blues9/1", "/blues9/8", "/blues9/7", "/blues9/4", "/blues9/2",  "/blues9/1"};
+static char *mic_worker_colors[STARPU_NMAXWORKERS] = {"/reds9/9", "/reds9/6", "/reds9/3", "/reds9/1", "/reds9/8", "/reds9/7", "/reds9/4", "/reds9/2",  "/reds9/1"};
+static char *scc_worker_colors[STARPU_NMAXWORKERS] = {"/reds9/9", "/reds9/6", "/reds9/3", "/reds9/1", "/reds9/8", "/reds9/7", "/reds9/4", "/reds9/2",  "/reds9/1"};
 static char *other_worker_colors[STARPU_NMAXWORKERS] = {"/greys9/9", "/greys9/8", "/greys9/7", "/greys9/6"};
 static char *other_worker_colors[STARPU_NMAXWORKERS] = {"/greys9/9", "/greys9/8", "/greys9/7", "/greys9/6"};
 static char *worker_colors[STARPU_NMAXWORKERS];
 static char *worker_colors[STARPU_NMAXWORKERS];
 
 
 static unsigned opencl_index = 0;
 static unsigned opencl_index = 0;
 static unsigned cuda_index = 0;
 static unsigned cuda_index = 0;
 static unsigned cpus_index = 0;
 static unsigned cpus_index = 0;
+static unsigned mic_index = 0;
+static unsigned scc_index = 0;
 static unsigned other_index = 0;
 static unsigned other_index = 0;
 
 
 static void set_next_other_worker_color(int workerid)
 static void set_next_other_worker_color(int workerid)
@@ -59,6 +63,16 @@ static void set_next_opencl_worker_color(int workerid)
 	worker_colors[workerid] = opencl_worker_colors[opencl_index++];
 	worker_colors[workerid] = opencl_worker_colors[opencl_index++];
 }
 }
 
 
+/* Give worker WORKERID the next unused entry of mic_worker_colors. */
+static void set_next_mic_worker_color(int workerid)
+{
+	const unsigned slot = mic_index;
+
+	mic_index = slot + 1;
+	worker_colors[workerid] = mic_worker_colors[slot];
+}
+
+/* Give worker WORKERID the next unused entry of scc_worker_colors. */
+static void set_next_scc_worker_color(int workerid)
+{
+	const unsigned slot = scc_index;
+
+	scc_index = slot + 1;
+	worker_colors[workerid] = scc_worker_colors[slot];
+}
+
 static const char *get_worker_color(int workerid)
 static const char *get_worker_color(int workerid)
 {
 {
 	return worker_colors[workerid];
 	return worker_colors[workerid];
@@ -345,6 +359,16 @@ static void handle_worker_init_start(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 			kindstr = "OPENCL";
 			kindstr = "OPENCL";
 			archtype = STARPU_OPENCL_DEFAULT + devid;
 			archtype = STARPU_OPENCL_DEFAULT + devid;
 			break;
 			break;
+		case _STARPU_FUT_MIC_KEY:
+			set_next_mic_worker_color(workerid);
+			kindstr = "mic";
+			archtype = STARPU_MIC_DEFAULT + devid;
+			break;
+		case _STARPU_FUT_SCC_KEY:
+			set_next_scc_worker_color(workerid);
+			kindstr = "scc";
+			archtype = STARPU_SCC_DEFAULT + devid;
+			break;
 		default:
 		default:
 			STARPU_ABORT();
 			STARPU_ABORT();
 	}
 	}

+ 2 - 1
src/drivers/gordon/driver_gordon.c

@@ -374,7 +374,8 @@ void *gordon_worker_inject(struct _starpu_worker_set *arg)
 					struct _starpu_job_list *chunk_list;
 					struct _starpu_job_list *chunk_list;
 					if (chunk != (nchunks -1))
 					if (chunk != (nchunks -1))
 					{
 					{
-						/* split the list in 2 parts : list = chunk_list | tail */
+						/* split the list in 2 parts :
+						 * list = chunk_list | tail */
 						chunk_list = _starpu_job_list_new();
 						chunk_list = _starpu_job_list_new();
 
 
 						/* find the end */
 						/* find the end */

+ 120 - 0
src/drivers/mic/driver_mic_common.c

@@ -0,0 +1,120 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <starpu.h>
+#include <drivers/mp_common/mp_common.h>
+#include <drivers/mic/driver_mic_common.h>
+
+
+/* Print the failure STATUS (an errno value, decoded with strerror) together
+ * with the call site FUNC / FILE / LINE it was detected at, then trip
+ * STARPU_ASSERT(0). */
+void _starpu_mic_common_report_scif_error(const char *func, const char *file, const int line, const int status)
+{
+	const char *errormsg = strerror(status);
+	/* LINE is a signed int, hence %d rather than %u */
+	printf("Common: oops in %s (%s:%d)... %d: %s \n", func, file, line, status, errormsg);
+	STARPU_ASSERT(0);
+}
+
+/* Blocking send of LEN bytes from MSG over the mp_connection endpoint of NODE.
+ * A SCIF failure is reported here, so that the (generic) caller does not
+ * have to deal with it.
+ */
+void _starpu_mic_common_send(const struct _starpu_mp_node *node, void *msg, int len)
+{
+	const int sent = scif_send(node->mp_connection.mic_endpoint, msg, len, SCIF_SEND_BLOCK);
+
+	if (sent < 0)
+		STARPU_MP_COMMON_REPORT_ERROR(node, errno);
+}
+
+/* Blocking receive of LEN bytes into MSG from the mp_connection endpoint of NODE.
+ * A SCIF failure is reported here, so that the (generic) caller does not
+ * have to deal with it.
+ */
+void _starpu_mic_common_recv(const struct _starpu_mp_node *node, void *msg, int len)
+{
+	const int received = scif_recv(node->mp_connection.mic_endpoint, msg, len, SCIF_RECV_BLOCK);
+
+	if (received < 0)
+		STARPU_MP_COMMON_REPORT_ERROR(node, errno);
+}
+
+/* Blocking send of LEN bytes from MSG over the host_sink_dt_connection
+ * endpoint of MP_NODE. A SCIF failure is reported here, so that the
+ * (generic) caller does not have to deal with it.
+ */
+void _starpu_mic_common_dt_send(const struct _starpu_mp_node *mp_node, void *msg, int len)
+{
+	const int sent = scif_send(mp_node->host_sink_dt_connection.mic_endpoint, msg, len, SCIF_SEND_BLOCK);
+
+	if (sent < 0)
+		STARPU_MP_COMMON_REPORT_ERROR(mp_node, errno);
+}
+
+/* Blocking receive of LEN bytes into MSG from the host_sink_dt_connection
+ * endpoint of MP_NODE. A SCIF failure is reported here, so that the
+ * (generic) caller does not have to deal with it.
+ */
+void _starpu_mic_common_dt_recv(const struct _starpu_mp_node *mp_node, void *msg, int len)
+{
+	/* scif_recv takes the receive-side flag, not SCIF_SEND_BLOCK */
+	if ((scif_recv(mp_node->host_sink_dt_connection.mic_endpoint, msg, len, SCIF_RECV_BLOCK)) < 0)
+		STARPU_MP_COMMON_REPORT_ERROR(mp_node, errno);
+}
+
+/* Open an endpoint into *ENDPOINT, bind it to LOCAL_PORT_NUMBER and connect it
+ * to REMOTE_PORT_NUMBER on node REMOTE_NODE. The connection attempt is
+ * repeated for as long as it fails with ECONNREFUSED; any other failure is
+ * reported through STARPU_MIC_COMMON_REPORT_SCIF_ERROR. */
+void _starpu_mic_common_connect(scif_epd_t *endpoint, uint16_t remote_node,
+				uint16_t local_port_number, uint16_t remote_port_number)
+{
+	/* Address of the peer we connect to */
+	struct scif_portID portID;
+
+	portID.node = remote_node;
+	portID.port = remote_port_number;
+
+	*endpoint = scif_open();
+	if (*endpoint < 0)
+		STARPU_MIC_COMMON_REPORT_SCIF_ERROR(errno);
+
+	if (scif_bind(*endpoint, local_port_number) < 0)
+		STARPU_MIC_COMMON_REPORT_SCIF_ERROR(errno);
+
+	_STARPU_DEBUG("Connecting to MIC %d on %d:%d...\n", remote_node, local_port_number, remote_port_number);
+	for (;;)
+	{
+		if (scif_connect(*endpoint, &portID) != -1)
+			break;
+
+		/* A refused connection is simply retried */
+		if (errno != ECONNREFUSED)
+			STARPU_MIC_COMMON_REPORT_SCIF_ERROR(errno);
+	}
+	_STARPU_DEBUG("done\n");
+}
+
+/* Wait for and accept a connection on port PORT_NUMBER; the resulting
+ * endpoint is stored in ENDPOINT. Any SCIF failure is reported through
+ * STARPU_MIC_COMMON_REPORT_SCIF_ERROR. */
+void _starpu_mic_common_accept(scif_epd_t *endpoint, uint16_t port_number)
+{
+	/* Not used afterwards: it only gives scif_accept a valid PEER
+	 * parameter to write to, which avoids a segfault */
+	struct scif_portID portID;
+
+	/* Listening endpoint, only useful while the connection is set up */
+	scif_epd_t init_epd;
+
+	if ((init_epd = scif_open()) < 0)
+		STARPU_MIC_COMMON_REPORT_SCIF_ERROR(errno);
+
+	if ((scif_bind(init_epd, port_number)) < 0)
+		STARPU_MIC_COMMON_REPORT_SCIF_ERROR(errno);
+
+	/* We fix the maximum number of requests to 1 as we
+	 * only need one connection, more would be an error */
+	if ((scif_listen(init_epd, 1)) < 0)
+		STARPU_MIC_COMMON_REPORT_SCIF_ERROR(errno);
+
+	_STARPU_DEBUG("MIC accepting connection on %u...\n", port_number);
+	if ((scif_accept(init_epd, &portID, endpoint, SCIF_ACCEPT_SYNC)) < 0)
+		STARPU_MIC_COMMON_REPORT_SCIF_ERROR(errno);
+	/* The format has no conversion, so no extra argument is passed */
+	_STARPU_DEBUG("done\n");
+
+	scif_close(init_epd);
+}

+ 70 - 0
src/drivers/mic/driver_mic_common.h

@@ -0,0 +1,70 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#ifndef __DRIVER_MIC_COMMON_H__
+#define __DRIVER_MIC_COMMON_H__
+
+
+#include <common/config.h>
+
+
+#ifdef STARPU_USE_MIC
+
+/* Maps an id to id + 1.
+ * NOTE(review): presumably a StarPU device id to a SCIF node id -- confirm. */
+#define STARPU_TO_MIC_ID(id) ((id) + 1)
+
+/* TODO: rather allocate ports on the host and pass them as parameters to the device process */
+/* First port of the range the macros below are computed from */
+#define STARPU_MIC_PORTS_BEGIN 1099
+
+/* Source port is the first port of the range; sink ports follow it, indexed by id */
+#define STARPU_MIC_SOURCE_PORT_NUMBER STARPU_MIC_PORTS_BEGIN
+#define STARPU_MIC_SINK_PORT_NUMBER(id) ((id) + STARPU_MIC_PORTS_BEGIN + 1)
+
+/* The "DT" ports are laid out the same way, STARPU_MAXMICDEVS + 1 ports further */
+#define STARPU_MIC_SOURCE_DT_PORT_NUMBER (STARPU_MAXMICDEVS + STARPU_MIC_PORTS_BEGIN + 1)
+#define STARPU_MIC_SINK_DT_PORT_NUMBER(id) ((id) + STARPU_MAXMICDEVS + STARPU_MIC_PORTS_BEGIN + 2)
+
+/* One port per (me, peer_id) pair, placed after the two ranges above */
+#define STARPU_MIC_SINK_SINK_DT_PORT_NUMBER(me, peer_id) \
+((me) * STARPU_MAXMICDEVS + (peer_id) +  2 * STARPU_MAXMICDEVS + STARPU_MIC_PORTS_BEGIN + 2)
+
+#define STARPU_MIC_PAGE_SIZE 0x1000
+/* Rounds size up to the next multiple of STARPU_MIC_PAGE_SIZE
+ * (size is evaluated more than once) */
+#define STARPU_MIC_GET_PAGE_SIZE_MULTIPLE(size) \
+(((size) % STARPU_MIC_PAGE_SIZE == 0) ? (size) : (((size) / STARPU_MIC_PAGE_SIZE + 1) * STARPU_MIC_PAGE_SIZE))
+
+/* Reports status together with the function, file and line of the call site */
+#define STARPU_MIC_COMMON_REPORT_SCIF_ERROR(status) \
+	_starpu_mic_common_report_scif_error(__starpu_func__, __FILE__, __LINE__, status)
+
+/* An address and a size.
+ * NOTE(review): presumably the payload of a "free" request -- confirm against its users. */
+struct _starpu_mic_free_command
+{
+	void *addr;
+	size_t size;
+};
+
+/* Error reporting helper behind STARPU_MIC_COMMON_REPORT_SCIF_ERROR */
+void _starpu_mic_common_report_scif_error(const char *func, const char *file, int line, const int status);
+
+/* Send / receive len bytes of msg for node */
+void _starpu_mic_common_send(const struct _starpu_mp_node *node, void *msg, int len);
+
+void _starpu_mic_common_recv(const struct _starpu_mp_node *node, void *msg, int len);
+
+/* "dt" variants of the send / receive pair above */
+void _starpu_mic_common_dt_send(const struct _starpu_mp_node *node, void *msg, int len);
+
+void _starpu_mic_common_dt_recv(const struct _starpu_mp_node *node, void *msg, int len);
+
+/* Connection set-up: connect to a remote node / port, or accept on a local port */
+void _starpu_mic_common_connect(scif_epd_t *endpoint, uint16_t remote_node, 
+				uint16_t local_port_number, uint16_t remote_port_number);
+void _starpu_mic_common_accept(scif_epd_t *endpoint, uint16_t port_number);
+
+#endif /* STARPU_USE_MIC */
+
+#endif /* __DRIVER_MIC_COMMON_H__ */

+ 135 - 0
src/drivers/mic/driver_mic_sink.c

@@ -0,0 +1,135 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2012  Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+
+#include <errno.h>
+
+#include <common/COISysInfo_common.h>
+
+#include <starpu.h>
+#include <drivers/mp_common/mp_common.h>
+#include <drivers/mp_common/sink_common.h>
+
+#include "driver_mic_common.h"
+#include "driver_mic_sink.h"
+
+/* Set up the MIC sink side of a node.
+ *
+ * Two connections with the source are accepted, in this order:
+ *  - node->mp_connection on STARPU_MIC_SOURCE_PORT_NUMBER,
+ *  - node->host_sink_dt_connection on STARPU_MIC_SOURCE_DT_PORT_NUMBER.
+ *
+ * Connections to the other devices are not implemented yet; the disabled
+ * code for them is kept at the end of the function.
+ */
+
+void _starpu_mic_sink_init(struct _starpu_mp_node *node)
+{
+	scif_epd_t *source_endpoint = &node->mp_connection.mic_endpoint;
+	scif_epd_t *source_dt_endpoint = &node->host_sink_dt_connection.mic_endpoint;
+
+	/* Connections with the source */
+	_starpu_mic_common_accept(source_endpoint, STARPU_MIC_SOURCE_PORT_NUMBER);
+	_starpu_mic_common_accept(source_dt_endpoint, STARPU_MIC_SOURCE_DT_PORT_NUMBER);
+
+	/* Connections with the other sinks (disabled, not implemented yet) */
+	//unsigned int i;
+
+	//node->sink_sink_dt_connections = malloc(node->nb_mp_sinks * sizeof(union _starpu_mp_connection));
+
+	//for (i = 0; i < (unsigned int)node->devid; ++i)
+	//	_starpu_mic_common_connect(&node->sink_sink_dt_connections[i].mic_endpoint,
+	//								STARPU_TO_MIC_ID(i),
+	//								STARPU_MIC_SINK_SINK_DT_PORT_NUMBER(node->devid, i),	
+	//								STARPU_MIC_SINK_SINK_DT_PORT_NUMBER(i, node->devid));
+
+	//for (i = node->devid + 1; i < node->nb_mp_sinks; ++i)
+	//	_starpu_mic_common_accept(&node->sink_sink_dt_connections[i].mic_endpoint,
+	//								STARPU_MIC_SINK_SINK_DT_PORT_NUMBER(node->devid, i));
+}
+
+/* Tear down the MIC sink: close node->host_sink_dt_connection first, then
+ * node->mp_connection. The sink-to-sink connections are never opened, so
+ * their cleanup stays disabled.
+ */
+
+void _starpu_mic_sink_deinit(struct _starpu_mp_node *node)
+{
+	scif_epd_t source_dt_endpoint = node->host_sink_dt_connection.mic_endpoint;
+	scif_epd_t source_endpoint = node->mp_connection.mic_endpoint;
+
+	/* Connections with the other sinks (disabled, not implemented yet) */
+	//unsigned int i;
+
+	//for (i = 0; i < node->nb_mp_sinks; ++i)
+	//{
+	//	if (i != (unsigned int)node->devid)
+	//		scif_close(node->sink_sink_dt_connections[i].mic_endpoint);
+	//}
+
+	//free(node->sink_sink_dt_connections);
+
+	/* Connections with the source */
+	scif_close(source_dt_endpoint);
+	scif_close(source_endpoint);
+}
+
+/* Report an error which occured when using a MIC device
+ * and print this error in a human-readable style
+ */
+
+void _starpu_mic_sink_report_error(const char *func, const char *file, const int line, const int status)
+{
+	const char *errormsg = strerror(status);
+	printf("SINK: oops in %s (%s:%u)... %d: %s \n", func, file, line, status, errormsg);
+	STARPU_ASSERT(0);
+}
+
+/* Number of cores of the machine running this code (a MIC device or a Xeon
+ * processor), i.e. the value of COISysGetCoreCount() converted to
+ * unsigned int.
+ */
+unsigned int _starpu_mic_sink_get_nb_core(void)
+{
+	const unsigned int nb_core = (unsigned int) COISysGetCoreCount();
+
+	return nb_core;
+}
+
+/* Allocate memory on the MIC and answer the request.
+ *
+ * arg points to a size_t holding the requested size; arg_size must be
+ * sizeof(size_t) (asserted). The buffer is aligned on STARPU_MIC_PAGE_SIZE.
+ * Unless STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY is defined, the buffer is also
+ * registered for remote direct access on the host/sink DT endpoint, at a
+ * fixed offset equal to its address and over a page-rounded window.
+ *
+ * Exactly one reply is sent through mp_node:
+ *  - STARPU_ANSWER_ALLOCATE carrying the buffer address on success,
+ *  - STARPU_ERROR_ALLOCATE (no payload) if the allocation or the
+ *    registration fails. */
+void _starpu_mic_sink_allocate(const struct _starpu_mp_node *mp_node, void *arg, int arg_size)
+{
+	STARPU_ASSERT(arg_size == sizeof(size_t));
+
+	void *addr = NULL;
+	size_t size = *(size_t *)(arg);
+	
+	if (posix_memalign(&addr, STARPU_MIC_PAGE_SIZE, size) != 0)
+	{
+		/* Nothing was allocated: report the failure and stop here, so
+		 * that no registration is attempted and no success answer
+		 * follows the error. */
+		_starpu_mp_common_send_command(mp_node, STARPU_ERROR_ALLOCATE, NULL, 0);
+		return;
+	}
+
+#ifndef STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY
+	scif_epd_t epd = mp_node->host_sink_dt_connection.mic_endpoint;
+	size_t window_size = STARPU_MIC_GET_PAGE_SIZE_MULTIPLE(size);
+
+	if (scif_register(epd, addr, window_size, (off_t)addr, SCIF_PROT_READ | SCIF_PROT_WRITE, SCIF_MAP_FIXED) < 0)
+	{
+		/* The buffer is useless without its window: release it and
+		 * stop here, so that the freed address is never sent back as
+		 * a successful answer. */
+		free(addr);
+		_starpu_mp_common_send_command(mp_node, STARPU_ERROR_ALLOCATE, NULL, 0);
+		return;
+	}
+#endif
+	
+	_starpu_mp_common_send_command(mp_node, STARPU_ANSWER_ALLOCATE, &addr, sizeof(addr));
+}
+
+/* Release the buffer described by a struct _starpu_mic_free_command.
+ *
+ * arg points to the command and arg_size must be its size (asserted).
+ * Unless STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY is defined, the page-rounded
+ * window starting at the buffer address is first unregistered from the
+ * host/sink DT endpoint (the result of scif_unregister() is not checked).
+ * The buffer is then freed. */
+void _starpu_mic_sink_free(const struct _starpu_mp_node *mp_node STARPU_ATTRIBUTE_UNUSED, void *arg, int arg_size)
+{
+	STARPU_ASSERT(arg_size == sizeof(struct _starpu_mic_free_command));
+
+	struct _starpu_mic_free_command *command = arg;
+	void *buffer = command->addr;
+
+#ifndef STARPU_DISABLE_ASYNCHRONOUS_MIC_COPY
+	size_t registered_size = STARPU_MIC_GET_PAGE_SIZE_MULTIPLE(command->size);
+
+	scif_unregister(mp_node->host_sink_dt_connection.mic_endpoint, (off_t)buffer, registered_size);
+#endif
+	free(buffer);
+}

+ 0 - 0
src/drivers/mic/driver_mic_sink.h


Some files were not shown because too many files changed in this diff