Browse Source

Merge branch 'master' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu

Nathalie Furmento 5 years ago
parent
commit
ff616a37aa
48 changed files with 1295 additions and 633 deletions
  1. 54 27
      configure.ac
  2. 4 2
      doc/doxygen/chapters/101_building.doxy
  3. 57 65
      doc/doxygen/chapters/320_scheduling.doxy
  4. 16 0
      doc/doxygen/chapters/410_mpi_support.doxy
  5. 61 18
      julia/StarPU.jl/src/StarPU.jl
  6. 50 30
      julia/StarPU.jl/src/compiler/c.jl
  7. 11 16
      julia/StarPU.jl/src/compiler/expressions.jl
  8. 63 24
      julia/StarPU.jl/src/compiler/file_generation.jl
  9. 0 2
      julia/black_scholes/black_scholes.jl
  10. 58 0
      julia/mandelbrot/Makefile
  11. 28 24
      julia/mandelbrot/cpu_mandelbrot.c
  12. 0 35
      julia/mandelbrot/makefile
  13. 81 178
      julia/mandelbrot/mandelbrot.c
  14. 12 22
      julia/mandelbrot/mandelbrot.jl
  15. 96 0
      julia/mandelbrot/mandelbrot_native.jl
  16. 28 16
      julia/mult/Makefile
  17. 43 44
      julia/mult/mult.jl
  18. 0 5
      julia/mult/mult_native.jl
  19. 58 0
      julia/variable/Makefile
  20. 38 0
      julia/variable/variable.jl
  21. 26 0
      julia/variable/variable_native.jl
  22. 58 0
      julia/vector_scal/Makefile
  23. 42 0
      julia/vector_scal/cpu_vector_scal.c
  24. 75 0
      julia/vector_scal/vector_scal.jl
  25. 3 2
      mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
  26. 2 0
      mpi/examples/mpi_lu/pxlu.c
  27. 14 9
      mpi/examples/mpi_lu/pxlu_implicit.c
  28. 1 1
      mpi/examples/user_datatype/user_datatype.c
  29. 1 1
      mpi/examples/user_datatype/user_datatype2.c
  30. 2 0
      mpi/src/Makefile.am
  31. 31 27
      mpi/src/mpi/starpu_mpi_mpi.c
  32. 53 0
      mpi/src/mpi/starpu_mpi_mpi.h
  33. 16 1
      mpi/src/mpi/starpu_mpi_mpi_backend.c
  34. 53 16
      mpi/src/nmad/starpu_mpi_nmad.c
  35. 53 0
      mpi/src/nmad/starpu_mpi_nmad.h
  36. 16 1
      mpi/src/nmad/starpu_mpi_nmad_backend.c
  37. 6 8
      mpi/src/starpu_mpi.c
  38. 29 3
      mpi/src/starpu_mpi_init.c
  39. 36 33
      mpi/src/starpu_mpi_private.h
  40. 5 0
      mpi/tests/abstract_sendrecv_bench.c
  41. 1 1
      mpi/tests/bench_helper.c
  42. 7 9
      mpi/tests/bench_helper.h
  43. 0 8
      mpi/tests/sendrecv_gemm_bench.c
  44. 2 0
      mpi/tests/sendrecv_parallel_tasks_bench.c
  45. 2 2
      src/debug/traces/starpu_fxt.c
  46. 2 0
      src/debug/traces/starpu_fxt.h
  47. 0 2
      src/debug/traces/starpu_fxt_mpi.c
  48. 1 1
      tools/gdbinit

+ 54 - 27
configure.ac

@@ -323,25 +323,43 @@ if test x$enable_simgrid = xyes ; then
 else
     DEFAULT_MPICC=mpicc
 fi
-AC_ARG_WITH(mpicc, [AS_HELP_STRING([--with-mpicc=<path to mpicc>], [Path of the mpicc compiler])], [DEFAULT_MPICC=$withval])
-AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH])
+AC_ARG_WITH(mpicc, [AS_HELP_STRING([--with-mpicc=<mpicc name or path to mpicc>], [Name or path of the mpicc compiler])], [DEFAULT_MPICC=$withval])
+case $DEFAULT_MPICC in
+	/*) mpicc_path="$DEFAULT_MPICC" ;;
+	*)  AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH]) ;;
+esac
+# We test if the MPICC compiler exists
+if test ! -x $mpicc_path; then
+    AC_MSG_RESULT(The mpicc compiler '$mpicc_path' does not have the execute permission)
+    mpicc_path=no
+fi
+
 AC_MSG_CHECKING(whether mpicc is available)
 AC_MSG_RESULT($mpicc_path)
 AC_SUBST(MPICC, $mpicc_path)
 
+if test x$mpicc_path != xno ; then
+    MPIPATH=$(dirname $mpicc_path):$PATH
+else
+    MPIPATH=$PATH
+fi
+
 #Check MPICXX/MPIC++
 if test x$enable_simgrid = xyes ; then
     DEFAULT_MPICXX=smpicxx
 else
     DEFAULT_MPICXX=mpicxx
 fi
-AC_ARG_WITH(mpicxx, [AS_HELP_STRING([--with-mpicxx=<path to mpicxx>], [Path of the mpicxx/mpic++ compiler])], [DEFAULT_MPICXX=$withval])
-AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
+AC_ARG_WITH(mpicxx, [AS_HELP_STRING([--with-mpicxx=<mpicxx name or path to mpicxx>], [Name or path of the mpicxx/mpic++ compiler])], [DEFAULT_MPICXX=$withval])
+case $DEFAULT_MPICXX in
+	/*) mpicxx_path="$DEFAULT_MPICXX" ;;
+	*)  AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$MPIPATH]) ;;
+esac
 
 # try with mpic++ if mpicxx was not found
 if test x$mpicxx_path = xno ; then
     DEFAULT_MPICXX=mpic++
-    AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
+    AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$MPIPATH])
 fi
 
 # We test if the MPICXX/MPIC++ compiler exists
@@ -490,17 +508,16 @@ if test x$enable_mpi = xyes -a x$running_mpi_check = xyes ; then
     # Check if mpiexec is available
     if test x$enable_simgrid = xyes ; then
 	DEFAULT_MPIEXEC=smpirun
-        AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<path to smpirun>]], [Path of the smpirun helper])], [DEFAULT_MPIEXEC=$withval])
-	AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$simgrid_dir/bin:$PATH])
+        AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<name of smpirun or path to smpirun>]], [Name or path of the smpirun helper])], [DEFAULT_MPIEXEC=$withval])
     else
 	DEFAULT_MPIEXEC=mpiexec
-	AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec=<path to mpiexec>], [Path of mpiexec])], [DEFAULT_MPIEXEC=$withval])
-	if test x$mpicc_path = x ; then
-	    AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$PATH])
-	else
-	    AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$(dirname $mpicc_path):$PATH])
-	fi
+	AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec=<name of mpiexec or path to mpiexec>], [Name or path of mpiexec])], [DEFAULT_MPIEXEC=$withval])
     fi
+
+    case $DEFAULT_MPIEXEC in
+	/*) mpiexec_path="$DEFAULT_MPIEXEC" ;;
+	*)  AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$MPIPATH])
+    esac
     AC_MSG_CHECKING(whether mpiexec is available)
     AC_MSG_RESULT($mpiexec_path)
 
@@ -2193,9 +2210,9 @@ AC_MSG_RESULT($nmaxbuffers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXBUFS, [$nmaxbuffers],
 		[how many buffers can be manipulated per task])
 
-AC_MSG_CHECKING(maximum number of nodes to use)
+AC_MSG_CHECKING(maximum number of memory nodes to use per MPI rank)
 AC_ARG_ENABLE(maxnodes, [AS_HELP_STRING([--enable-maxnodes=<nnodes>],
-			[maximum number of nodes])],
+			[maximum number of memory nodes per MPI rank])],
 			maxnodes=$enableval, maxnodes=0)
 
 if test x$maxnodes = x0 ; then
@@ -2467,16 +2484,19 @@ enable_build_fortran=no
 if test "x$enable_build_fortran_requested" = "xyes" ; then
    if test "x$FC" != "x"; then
    	if $FC --version|grep -q 'GNU Fortran'; then
-		 AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
-     	         #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)
-                 int dummy;
-                 #else
-                 #error GFortran too old, version >= 4.9.x needed, Fortran examples will not be built
-                 #endif
-                 ]],
+		 AC_LANG_PUSH([Fortran])
+		 OLD_FCFLAGS="$FCFLAGS"
+		 FCFLAGS="$FCFLAGS -cpp"
+		 AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [[
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 9)
+#error GFortran too old, version >= 4.9.x needed, Fortran examples will not be built
+#endif
+]]
                  )],
                  [enable_build_fortran="yes"],
                  [enable_build_fortran="no"])
+		 FCFLAGS="$OLD_FCFLAGS"
+		 AC_LANG_POP([Fortran])
                  if test "$enable_build_fortran" = "no" ; then
                    AC_MSG_WARN([GFortran too old, version >= 4.9.x needed, Fortran examples will not be built])
                  fi
@@ -2519,8 +2539,10 @@ if test "x$enable_build_fortran_requested" = "xyes" ; then
 					      else
 						      DEFAULT_MPIFORT=mpif90
 					      fi
-					      # nothing was specified: default value is used
-					      AC_PATH_PROG(mpifort_path, $DEFAULT_MPIFORT, [no], [$(dirname $mpicc_path):$simgrid_dir/bin:$PATH])
+					      case $DEFAULT_MPIFORT in
+					      	/*) mpifort_path="$DEFAULT_MPIFORT" ;;
+					        *)  AC_PATH_PROG(mpifort_path, $DEFAULT_MPIFORT, [no], [$MPIPATH])
+					      esac
 					      ])
 
 			# We test if the MPIFORT compiler exists
@@ -3074,10 +3096,15 @@ AC_ARG_WITH([hwloc],
 				if test ! -d "$withval" ; then
 				   AC_MSG_ERROR("Directory specified for hwloc <$withval> does not exist")
 				fi
-				if test ! -d "$withval/lib/pkgconfig" ; then
-				   AC_MSG_ERROR("Hwloc directory <$withval> does not have a subdirectory lib/pkgconfig")
+				if test -d "$withval/lib64/pkgconfig" ; then
+				   export PKG_CONFIG_PATH=$withval/lib64/pkgconfig:$PKG_CONFIG_PATH
+			        else
+				   if test -d "$withval/lib/pkgconfig" ; then
+				      export PKG_CONFIG_PATH=$withval/lib/pkgconfig:$PKG_CONFIG_PATH
+				   else
+				      AC_MSG_ERROR("Hwloc directory <$withval> does not have a subdirectory lib/pkgconfig or lib64/pkgconfig")
+				   fi
 				fi
-				export PKG_CONFIG_PATH=$withval/lib/pkgconfig:$PKG_CONFIG_PATH
 				use_hwloc=yes
 			fi
 		else

+ 4 - 2
doc/doxygen/chapters/101_building.doxy

@@ -43,8 +43,10 @@ can be used to install StarPU.
 The <c>hwloc</c> (http://www.open-mpi.org/software/hwloc) topology
 discovery library is not mandatory to use StarPU but strongly
 recommended.  It allows for topology aware scheduling, which improves
-performance.  <c>libhwloc</c> is available in major free operating system
-distributions, and for most operating systems.
+performance. <c>hwloc</c> is available in major free operating system
+distributions, and for most operating systems. Make sure to not only install
+a <c>hwloc</c> or <c>libhwloc</c> package, but also <c>hwloc-devel</c> or
+<c>libhwloc-dev</c> so as to have hwloc headers etc.
 
 If <c>libhwloc</c> is installed in a standard
 location, no option is required, it will be detected automatically,

+ 57 - 65
doc/doxygen/chapters/320_scheduling.doxy

@@ -39,33 +39,33 @@ STARPU_SCHED. For instance <c>export STARPU_SCHED=dmda</c> . Use <c>help</c> to
 get the list of available schedulers.
 
 
-<b>Non Performance Modelling Policies:</b>
+\subsection NonPerformanceModelingPolicies Non Performance Modelling Policies
 
-The <b>eager</b> scheduler uses a central task queue, from which all workers draw tasks
+- The <b>eager</b> scheduler uses a central task queue, from which all workers draw tasks
 to work on concurrently. This however does not permit to prefetch data since the scheduling
 decision is taken late. If a task has a non-0 priority, it is put at the front of the queue.
 
-The <b>random</b> scheduler uses a queue per worker, and distributes tasks randomly according to assumed worker
+- The <b>random</b> scheduler uses a queue per worker, and distributes tasks randomly according to assumed worker
 overall performance.
 
-The <b>ws</b> (work stealing) scheduler uses a queue per worker, and schedules
+- The <b>ws</b> (work stealing) scheduler uses a queue per worker, and schedules
 a task on the worker which released it by
 default. When a worker becomes idle, it steals a task from the most loaded
 worker.
 
-The <b>lws</b> (locality work stealing) scheduler uses a queue per worker, and schedules
+- The <b>lws</b> (locality work stealing) scheduler uses a queue per worker, and schedules
 a task on the worker which released it by
 default. When a worker becomes idle, it steals a task from neighbour workers. It
 also takes into account priorities.
 
-The <b>prio</b> scheduler also uses a central task queue, but sorts tasks by
+- The <b>prio</b> scheduler also uses a central task queue, but sorts tasks by
 priority specified by the programmer (between -5 and 5).
 
-The <b>heteroprio</b> scheduler uses different priorities for the different processing units.
+- The <b>heteroprio</b> scheduler uses different priorities for the different processing units.
 This scheduler must be configured to work correctly and to obtain high performance
 as described in the corresponding section.
 
-\section DMTaskSchedulingPolicy Performance Model-Based Task Scheduling Policies
+\subsection DMTaskSchedulingPolicy Performance Model-Based Task Scheduling Policies
 
 If (<b>and only if</b>) your application <b>codelets have performance models</b> (\ref
 PerformanceModelExample), you should change the scheduler thanks to the
@@ -87,47 +87,84 @@ family policy using performance model hints. A low or zero percentage may be
 the sign that performance models are not converging or that codelets do not
 have performance models enabled.
 
-<b>Performance Modelling Policies:</b>
-
-The <b>dm</b> (deque model) scheduler takes task execution performance models into account to
+- The <b>dm</b> (deque model) scheduler takes task execution performance models into account to
 perform a HEFT-similar scheduling strategy: it schedules tasks where their
 termination time will be minimal. The difference with HEFT is that <b>dm</b>
 schedules tasks as soon as they become available, and thus in the order they
 become available, without taking priorities into account.
 
-The <b>dmda</b> (deque model data aware) scheduler is similar to dm, but it also takes
+- The <b>dmda</b> (deque model data aware) scheduler is similar to dm, but it also takes
 into account data transfer time.
 
-The <b>dmdap</b> (deque model data aware prio) scheduler is similar to dmda,
+- The <b>dmdap</b> (deque model data aware prio) scheduler is similar to dmda,
 except that it sorts tasks by priority order, which allows to become even closer
 to HEFT by respecting priorities after having made the scheduling decision (but
 it still schedules tasks in the order they become available).
 
-The <b>dmdar</b> (deque model data aware ready) scheduler is similar to dmda,
+- The <b>dmdar</b> (deque model data aware ready) scheduler is similar to dmda,
 but it also privileges tasks whose data buffers are already available
 on the target device.
 
-The <b>dmdas</b> combines dmdap and dmdas: it sorts tasks by priority order,
+- The <b>dmdas</b> combines dmdap and dmdar: it sorts tasks by priority order,
 but for a given priority it will privilege tasks whose data buffers are already
 available on the target device.
 
-The <b>dmdasd</b> (deque model data aware sorted decision) scheduler is similar
+- The <b>dmdasd</b> (deque model data aware sorted decision) scheduler is similar
 to dmdas, except that when scheduling a task, it takes into account its priority
 when computing the minimum completion time, since this task may get executed
 before others, and thus the latter should be ignored.
 
-The <b>heft</b> (heterogeneous earliest finish time) scheduler is a deprecated
+- The <b>heft</b> (heterogeneous earliest finish time) scheduler is a deprecated
 alias for <b>dmda</b>.
 
-The <b>pheft</b> (parallel HEFT) scheduler is similar to dmda, it also supports
+- The <b>pheft</b> (parallel HEFT) scheduler is similar to dmda, it also supports
 parallel tasks (still experimental). Should not be used when several contexts using
 it are being executed simultaneously.
 
-The <b>peager</b> (parallel eager) scheduler is similar to eager, it also
+- The <b>peager</b> (parallel eager) scheduler is similar to eager, it also
 supports parallel tasks (still experimental). Should not be used when several 
 contexts using it are being executed simultaneously.
 
-TODO: describe modular schedulers
+\subsection ExistingModularizedSchedulers Modularized Schedulers
+
+StarPU provides a powerful way to implement schedulers, as documented in \ref
+DefiningANewModularSchedulingPolicy . It is currently shipped with the following
+pre-defined Modularized Schedulers :
+
+
+- <b>modular-eager</b>, <b>modular-eager-prefetching</b> are eager-based Schedulers (without and with prefetching), they are \n
+naive schedulers, which try to map a task on the first available resource
+they find. The prefetching variant queues several tasks in advance to be able to
+do data prefetching. This may however degrade load balancing a bit.
+
+- <b>modular-prio</b>, <b>modular-prio-prefetching</b>, <b>modular-eager-prio</b> are prio-based Schedulers (without / with prefetching):
+similar to Eager-Based Schedulers. Can handle tasks which have a defined
+priority and schedule them accordingly.
+The <b>modular-eager-prio</b> variant integrates the eager and priority queue in a
+single component. This allows it to do a better job at pushing tasks.
+
+- <b>modular-random</b>, <b>modular-random-prio</b>, <b>modular-random-prefetching</b>, <b>modular-random-prio-prefetching</b> are random-based Schedulers (without/with prefetching) : \n
+Select randomly a resource to be mapped on for each task.
+
+- <b>modular-ws</b> implements Work Stealing:
+Maps tasks to workers in round robin, but allows workers to steal work from other workers.
+
+- <b>modular-heft</b>, <b>modular-heft2</b>, and <b>modular-heft-prio</b> are
+HEFT Schedulers : \n
+Maps tasks to workers using a heuristic very close to
+Heterogeneous Earliest Finish Time.
+It requires that every task submitted to StarPU has a
+defined performance model (\ref PerformanceModelCalibration)
+to work efficiently, but can handle tasks without a performance
+model. <b>modular-heft</b> just takes tasks by priority order. <b>modular-heft2</b> takes
+at most 5 tasks of the same priority and checks which one fits best.
+<b>modular-heft-prio</b> is similar to <b>modular-heft</b>, but only decides the memory
+node, not the exact worker, just pushing tasks to one central queue per memory
+node.
+
+- <b>modular-heteroprio</b> is a Heteroprio Scheduler: \n
+Maps tasks to workers similarly to HEFT, but first attributes accelerated tasks to
+GPUs, then not-so-accelerated tasks to CPUs.
 
 \section TaskDistributionVsDataTransfer Task Distribution Vs Data Transfer
 
@@ -198,51 +235,6 @@ use starpu_task_expected_length() on the task (in µs), multiplied by the
 typical power consumption of the device, e.g. in W, and divided by 1000000. to
 get Joules.
 
-\section ExistingModularizedSchedulers Modularized Schedulers
-
-StarPU provides a powerful way to implement schedulers, as documented in \ref
-DefiningANewModularSchedulingPolicy . It is currently shipped with the following
-pre-defined Modularized Schedulers :
-
-- Eager-based Schedulers (with/without prefetching : \c modular-eager ,
-\c modular-eager-prefetching) : \n
-Naive scheduler, which tries to map a task on the first available resource
-it finds. The prefecthing variant queues several tasks in advance to be able to
-do data prefetching. This may however degrade load balancing a bit.
-
-- Prio-based Schedulers (with/without prefetching :
-\c modular-prio, \c modular-prio-prefetching , \c modular-eager-prio) : \n
-Similar to Eager-Based Schedulers. Can handle tasks which have a defined
-priority and schedule them accordingly.
-The \c modular-eager-prio variant integrates the eager and priority queue in a
-single component. This allows it to do a better job at pushing tasks.
-
-- Random-based Schedulers (with/without prefetching: \c modular-random,
-\c modular-random-prio, \c modular-random-prefetching, \c
-modular-random-prio-prefetching) : \n
-Selects randomly a resource to be mapped on for each task.
-
-- Work Stealing (\c modular-ws) : \n
-Maps tasks to workers in round robin, but allows workers to steal work from other workers.
-
-- HEFT Scheduler : \n
-Maps tasks to workers using a heuristic very close to
-Heterogeneous Earliest Finish Time.
-It needs that every task submitted to StarPU have a
-defined performance model (\ref PerformanceModelCalibration)
-to work efficiently, but can handle tasks without a performance
-model. \c modular-heft just takes tasks by priority order. \c modular-heft takes
-at most 5 tasks of the same priority and checks which one fits best. \c
-modular-heft-prio is similar to \c modular-heft, but only decides the memory
-node, not the exact worker, just pushing tasks to one central queue per memory
-node.
-
-- Heteroprio Scheduler: \n
-Maps tasks to worker similarly to HEFT, but first attribute accelerated tasks to
-GPUs, then not-so-accelerated tasks to CPUs.
-
-To use one of these schedulers, one can set the environment variable \ref STARPU_SCHED.
-
 \section StaticScheduling Static Scheduling
 
 In some cases, one may want to force some scheduling, for instance force a given

+ 16 - 0
doc/doxygen/chapters/410_mpi_support.doxy

@@ -34,6 +34,22 @@ An MPI Insert Task function provides an even more seamless transition to a
 distributed application, by automatically issuing all required data transfers
 according to the task graph and an application-provided distribution.
 
+\section MPIBuild Building with MPI support
+
+If a <c>mpicc</c> compiler is already in your PATH, StarPU will automatically
+enable MPI support in the build. If <c>mpicc</c> is not in PATH, you
+can specify its location by passing <c>--with-mpicc=/where/there/is/mpicc</c> to
+<c>./configure</c>
+
+It can be useful to enable MPI tests during <c>make check</c> by passing
+<c>--enable-mpi-check</c> to <c>./configure</c>. And similarly to
<c>mpicc</c>, if <c>mpiexec</c> is not in PATH, you can specify its location by passing
+<c>--with-mpiexec=/where/there/is/mpiexec</c> to <c>./configure</c>, but this is
not needed if it is next to <c>mpicc</c>, as configure will look there in addition to PATH.
+
+Similarly, Fortran examples use <c>mpif90</c>, which can be specified manually
+with <c>--with-mpifort</c> if it can't be found automatically.
+
 \section ExampleDocumentation Example Used In This Documentation
 
 The example below will be used as the base for this documentation. It

+ 61 - 18
julia/StarPU.jl/src/StarPU.jl

@@ -30,16 +30,11 @@ macro starpucall(func, ret_type, arg_types, args...)
     return Expr(:call, :ccall, (func, starpu_task_library_name), esc(ret_type), esc(arg_types), map(esc, args)...)
 end
 
-export @debugprint
-macro debugprint(x...)
-    quote
-        println("\x1b[32m", $x..., "\x1b[0m")
-        flush(stdout)
-    end
+function debug_print(x...)
+    println("\x1b[32m", x..., "\x1b[0m")
+    flush(stdout)
 end
 
-
-
 function Cstring_from_String(str :: String)
     return Cstring(pointer(str))
 end
@@ -473,7 +468,7 @@ mutable struct StarpuTask
     handles :: Vector{StarpuDataHandle}
     handle_pointers :: Vector{StarpuDataHandlePointer}
     synchronous :: Bool
-    cl_arg :: Union{Ref, Cvoid}
+    cl_arg # type depends on codelet
 
     c_task :: Ptr{Cvoid}
 
@@ -483,7 +478,7 @@ mutable struct StarpuTask
 
         Creates a new task which will run the specified codelet on handle buffers and cl_args data
     """
-    function StarpuTask(; cl :: Union{Cvoid, StarpuCodelet} = nothing, handles :: Vector{StarpuDataHandle} = StarpuDataHandle[], cl_arg :: Union{Ref, Cvoid} = nothing)
+    function StarpuTask(; cl :: Union{Cvoid, StarpuCodelet} = nothing, handles :: Vector{StarpuDataHandle} = StarpuDataHandle[], cl_arg = [])
 
         if (cl == nothing)
             error("\"cl\" field can't be empty when creating a StarpuTask")
@@ -493,7 +488,29 @@ mutable struct StarpuTask
 
         output.cl = cl
         output.handles = handles
-        output.cl_arg = cl_arg
+
+        # handle scalar_parameters
+        codelet_name = cl.cpu_func
+        if isempty(codelet_name)
+            codelet_name = cl.cuda_func
+        end
+        if isempty(codelet_name)
+            codelet_name = cl.opencl_func
+        end
+        if isempty(codelet_name)
+            error("No function provided with codelet.")
+        end
+        scalar_parameters = get(CODELETS_SCALARS, codelet_name, nothing)
+        if scalar_parameters != nothing
+            nb_scalar_required = length(scalar_parameters)
+            nb_scalar_provided = length(cl_arg)
+            if (nb_scalar_provided != nb_scalar_required)
+                error("$nb_scalar_provided scalar parameters provided but $nb_scalar_required are required by $codelet_name.")
+            end
+            output.cl_arg = create_param_struct_from_clarg(codelet_name, cl_arg)
+        else
+            output.cl_arg = nothing
+        end
 
         output.synchronous = false
         output.handle_pointers = StarpuDataHandlePointer[]
@@ -513,6 +530,28 @@ mutable struct StarpuTask
 
 end
 
+function create_param_struct_from_clarg(name, cl_arg)
+    struct_params_name = CODELETS_PARAMS_STRUCT[name]
+
+    if struct_params_name == false
+        error("structure name not found in CODELET_PARAMS_STRUCT")
+    end
+
+    nb_scalar_provided = length(cl_arg)
+    create_struct_param_str = "output = $struct_params_name("
+    for i in 1:nb_scalar_provided-1
+        arg = cl_arg[i]
+        create_struct_param_str *= "$arg, "
+        end
+    if (nb_scalar_provided > 0)
+        arg = cl_arg[nb_scalar_provided]
+        create_struct_param_str *= "$arg"
+    end
+    create_struct_param_str *= ")"
+    eval(Meta.parse(create_struct_param_str))
+    return output
+end
+
 """
     Structure used to update fields of the real C task structure 
 """
@@ -539,8 +578,8 @@ mutable struct StarpuTaskTranslator
             output.cl_arg = C_NULL
             output.cl_arg_size = 0
         else
-            output.cl_arg = pointer_from_objref(task.cl_arg) #TODO : Libc.malloc and cl_arg_free set to 1 ? but it should be done only when submitting
-            output.cl_arg_size = sizeof(eltype(task.cl_arg))
+            output.cl_arg = pointer_from_objref(task.cl_arg)
+            output.cl_arg_size = sizeof(task.cl_arg)
         end
 
         return output
@@ -575,9 +614,11 @@ end
     cpu and gpu function names
 """
 function starpu_init()
+    debug_print("starpu_init")
+
     if (get(ENV,"JULIA_TASK_LIB",0)!=0)
         global starpu_tasks_library_handle= Libdl.dlopen(ENV["JULIA_TASK_LIB"])
-        @debugprint "Loading external codelet library"
+        debug_print("Loading external codelet library")
         ff = Libdl.dlsym(starpu_tasks_library_handle,:starpu_find_function)
         dump(ff)
         for k in keys(CUDA_CODELETS)
@@ -585,7 +626,7 @@ function starpu_init()
             print(k,">>>>",CPU_CODELETS[k],"\n")
         end
     else
-        @debugprint "generating codelet library"
+        debug_print("generating codelet library")
         run(`make generated_tasks.so`);
         global starpu_tasks_library_handle=Libdl.dlopen("generated_tasks.so")
     end
@@ -600,6 +641,8 @@ end
     Must be called at the end of the program
 """
 function starpu_shutdown()
+    debug_print("starpu_shutdown")
+
     starpu_exit_block()
     @starpucall starpu_shutdown Cvoid ()
     jlstarpu_free_allocated_structures()
@@ -815,7 +858,7 @@ end
     Creates and submits an asynchronous task running cl Codelet function.
     Ex : @starpu_async_cl cl(handle1, handle2)
 """
-macro starpu_async_cl(expr,modes)
+macro starpu_async_cl(expr,modes,cl_arg=[])
 
     if (!isa(expr, Expr) || expr.head != :call)
         error("Invalid task submit syntax")
@@ -830,7 +873,7 @@ macro starpu_async_cl(expr,modes)
     println(CPU_CODELETS[string(expr.args[1])])
     cl = StarpuCodelet(
         cpu_func = CPU_CODELETS[string(expr.args[1])],
-        #cuda_func = "matrix_mult",
+        # cuda_func = CUDA_CODELETS[string(expr.args[1])],
         #opencl_func="ocl_matrix_mult",
         ### TODO: CORRECT !
         modes = map((x -> starpu_modes(x)),modes.args),
@@ -839,7 +882,7 @@ macro starpu_async_cl(expr,modes)
     handles = Expr(:vect, expr.args[2:end]...)
     #dump(handles)
     quote
-        task = StarpuTask(cl = $(esc(cl)), handles = $(esc(handles)))
+        task = StarpuTask(cl = $(esc(cl)), handles = $(esc(handles)), cl_arg=$(esc(cl_arg)))
         starpu_task_submit(task)
     end
 end

+ 50 - 30
julia/StarPU.jl/src/compiler/c.jl

@@ -1,5 +1,3 @@
-
-
 """
     Returns the list of instruction that will be added before for loop of shape
         "for for_index_var in set ..."
@@ -56,12 +54,7 @@ function add_for_loop_declarations(expr :: StarpuExpr)
     return apply(func_to_apply, expr)
 end
 
-
-
-
-
 function transform_to_cpu_kernel(expr :: StarpuExprFunction)
-
     output = add_for_loop_declarations(expr)
     output = substitute_args(output)
     output = substitute_func_calls(output)
@@ -71,7 +64,20 @@ function transform_to_cpu_kernel(expr :: StarpuExprFunction)
     return output
 end
 
+function generate_c_struct_param_declaration(funcname)
+    scalar_parameters = CODELETS_SCALARS[funcname]
+    struct_params_name = CODELETS_PARAMS_STRUCT[funcname]
+
+    output = "struct $struct_params_name {\n"
+    for p in scalar_parameters
+        arg_name = p[1]
+        arg_type = p[2]
+        output *= "\t" * starpu_type_traduction(arg_type) * " $arg_name;\n"
+    end
+    output *= "};\n\n"
 
+    return output
+end
 
 function flatten_blocks(expr :: StarpuExpr)
 
@@ -130,46 +136,62 @@ end
 
 
 function substitute_args(expr :: StarpuExprFunction)
-
     new_body = expr.body
     func_id = rand_string()
     buffer_arg_name = Symbol("buffers_", func_id)
     cl_arg_name = Symbol("cl_arg_", func_id)
-    post = false
     function_start_affectations = StarpuExpr[]
 
+    buffer_id = 1
+    scalar_id = 1
+
+    # get scalar parameters and structure name
+    scalar_parameters = CODELETS_SCALARS[string(expr.func)]
+    struct_params_name = CODELETS_PARAMS_STRUCT[string(expr.func)]
+
     for i in (1 : length(expr.args))
 
         var_id = rand_string()
         ptr = Symbol(:ptr_, var_id)
         var_name = ptr
-        
+
         if (expr.args[i].typ <: Vector)
             func_interface = :STARPU_VECTOR_GET_PTR
+            type_in_arg = eltype(expr.args[i].typ)
+            new_affect = starpu_parse( :($ptr :: Ptr{$type_in_arg} = $func_interface($buffer_arg_name[$buffer_id])) )
+            push!(function_start_affectations, new_affect)
+            new_body = substitute_argument_usage(new_body, buffer_id, buffer_arg_name, expr.args[i].name, var_name)
+            buffer_id += 1
         elseif (expr.args[i].typ <: Matrix)
             func_interface = :STARPU_MATRIX_GET_PTR
             ld_name = Symbol("ld_", var_id)
-            post_affect = starpu_parse( :($ld_name :: UInt32 = STARPU_MATRIX_GET_LD($buffer_arg_name[$i])) )
-            post=true
-            
-        elseif (expr.args[i].typ <: Float32)
+            post_affect = starpu_parse( :($ld_name :: UInt32 = STARPU_MATRIX_GET_LD($buffer_arg_name[$buffer_id])) )
+            type_in_arg = eltype(expr.args[i].typ)
+            new_affect = starpu_parse( :($ptr :: Ptr{$type_in_arg} = $func_interface($buffer_arg_name[$buffer_id])) )
+            push!(function_start_affectations, new_affect)
+            push!(function_start_affectations, post_affect)
+            new_body = substitute_argument_usage(new_body, buffer_id, buffer_arg_name, expr.args[i].name, var_name)
+            buffer_id += 1
+        elseif (expr.args[i].typ <: Ref)
             func_interface = :STARPU_VARIABLE_GET_PTR
-            var_name = Symbol("scal_", var_id)
-            post_affect = starpu_parse( :($var_name :: Float32 = ($ptr[0])) )
-            post = true
-            
-        end
-        #else
-            #error("Task arguments must be either vector or matrix (got $(expr.args[i].typ))") #TODO : cl_args, variable ?
-        #end
-
-        type_in_arg = eltype(expr.args[i].typ)
-        new_affect = starpu_parse( :($ptr :: Ptr{$type_in_arg} = $func_interface($buffer_arg_name[$i])) )
-        push!(function_start_affectations, new_affect)
-        if (post)
+            type_in_arg = eltype(expr.args[i].typ)
+            new_affect = starpu_parse( :($ptr :: Ptr{$type_in_arg} = $func_interface($buffer_arg_name[$buffer_id])) )
+            push!(function_start_affectations, new_affect)
+            new_body = substitute_argument_usage(new_body, buffer_id, buffer_arg_name, expr.args[i].name, Symbol("(*$var_name)"))
+            buffer_id += 1
+        elseif (expr.args[i].typ <: Number || expr.args[i].typ <: AbstractChar)
+            type_in_arg = eltype(expr.args[i].typ)
+            field_name = scalar_parameters[scalar_id][1]
+            var_name = field_name
+            post_affect = starpu_parse( :($var_name :: $type_in_arg = *($ptr).$field_name))
+            new_affect = starpu_parse( :($ptr :: Ptr{$struct_params_name} = $cl_arg_name))
+            push!(function_start_affectations, new_affect)
             push!(function_start_affectations, post_affect)
+            scalar_id += 1
+        else
+            error("Task arguments must be either matrix, vector, ref or scalar (got $(expr.args[i].typ))")
         end
-        new_body = substitute_argument_usage(new_body, i, buffer_arg_name, expr.args[i].name, var_name)
+
 
     end
 
@@ -183,8 +205,6 @@ function substitute_args(expr :: StarpuExprFunction)
     return StarpuExprFunction(expr.ret_type, expr.func, new_args, new_body)
 end
 
-
-
 func_substitution = Dict(
     :width => :STARPU_MATRIX_GET_NY,
     :height => :STARPU_MATRIX_GET_NX,

+ 11 - 16
julia/StarPU.jl/src/compiler/expressions.jl

@@ -1,3 +1,14 @@
+global starpu_type_traduction_dict = Dict(
+    Int32 => "int32_t",
+    UInt32 => "uint32_t",
+    Float32 => "float",
+    Int64 => "int64_t",
+    UInt64 => "uint64_t",
+    Float64 => "double",
+    Nothing => "void"
+)
+export starpu_type_traduction_dict
+
 
 #======================================================
                 AFFECTATION
@@ -841,22 +852,6 @@ function starpu_parse_typed(x :: Expr)
     return StarpuExprTypedExpr(expr, typ)
 end
 
-
-
-
-
-starpu_type_traduction_dict = Dict(
-    Int32 => "int32_t",
-    UInt32 => "uint32_t",
-    Float32 => "float",
-    Int64 => "int64_t",
-    UInt64 => "uint64_t",
-    Float64 => "double",
-    Nothing => "void"
-)
-
-
-
 function starpu_type_traduction(x)
     if x <: Array
         return starpu_type_traduction_array(x)

+ 63 - 24
julia/StarPU.jl/src/compiler/file_generation.jl

@@ -95,40 +95,79 @@ global CPU_CODELETS=Dict{String,String}()
 export CUDA_CODELETS
 global CUDA_CODELETS=Dict{String,String}()
 
+export CODELETS_SCALARS
+global CODELETS_SCALARS=Dict{String,Any}()
+export CODELETS_PARAMS_STRUCT
+global CODELETS_PARAMS_STRUCT=Dict{String,Any}()
+
 """
 	    Executes @cuda_kernel and @cpu_kernel
         """
 macro codelet(x)
     parsed = starpu_parse(x)
     name=string(x.args[1].args[1].args[1]);
+    cpu_name = name
+    cuda_name = "CUDA_"*name
     dump(name)
+    parse_scalar_parameters(parsed, cpu_name, cuda_name)
+    c_struct_param_decl = generate_c_struct_param_declaration(name)
     cpu_expr = transform_to_cpu_kernel(parsed)
-    prekernel, kernel = transform_to_cuda_kernel(parsed)
+
+    if (starpu_target & STARPU_CUDA != 0)
+        prekernel, kernel = transform_to_cuda_kernel(parsed)
+    end
+
     generated_cpu_kernel_file_name=string("genc_",string(x.args[1].args[1].args[1]),".c")
     generated_cuda_kernel_file_name=string("gencuda_",string(x.args[1].args[1].args[1]),".cu")
-    targets=starpu_target
-    return quote
-        
-        if ($targets&$STARPU_CPU!=0)
-            kernel_file = open($(esc(generated_cpu_kernel_file_name)), "w")
-            @debugprint "generating " $(generated_cpu_kernel_file_name)
-            print(kernel_file, $(esc(cpu_kernel_file_start)))
-            print(kernel_file, $cpu_expr)
-            close(kernel_file)
-            CPU_CODELETS[$name]=$name
-        end
-        
-        if ($targets&$STARPU_CUDA!=0)
-            kernel_file = open($(esc(generated_cuda_kernel_file_name)), "w")
-            @debugprint "generating " $(generated_cuda_kernel_file_name)
-            print(kernel_file, $(esc(cuda_kernel_file_start)))
-            print(kernel_file, "__global__ ", $kernel)
-            print(kernel_file, "\nextern \"C\" ", $prekernel)
-            close(kernel_file)
-            CUDA_CODELETS[$name]="CUDA_"*$name
+
+    if (starpu_target & STARPU_CPU != 0)
+        kernel_file = open(generated_cpu_kernel_file_name, "w")
+        debug_print("generating ", generated_cpu_kernel_file_name)
+        print(kernel_file, cpu_kernel_file_start)
+        print(kernel_file, c_struct_param_decl)
+        print(kernel_file, cpu_expr)
+        close(kernel_file)
+        CPU_CODELETS[name]=cpu_name
+    end
+
+    if starpu_target & STARPU_CUDA!=0
+        kernel_file = open(generated_cuda_kernel_file_name, "w")
+        debug_print("generating ", generated_cuda_kernel_file_name)
+        print(kernel_file, cuda_kernel_file_start)
+        print(kernel_file, "__global__ ", kernel)
+        print(kernel_file, c_struct_param_decl)
+        print(kernel_file, "\nextern \"C\" ", prekernel)
+        close(kernel_file)
+        CUDA_CODELETS[name]=cuda_name
+    end
+end
+
+function parse_scalar_parameters(expr :: StarpuExprFunction, cpu_name::String, cuda_name::String)
+    scalar_parameters = []
+    for i in (1 : length(expr.args))
+        type = expr.args[i].typ
+        if (type <: Number || type <: AbstractChar)
+            push!(scalar_parameters, (expr.args[i].name, type))
         end
-        print("end generation")
-        #starpu_task_library_name="generated_tasks"
-        #global starpu_task_library_name
     end
+
+    CODELETS_SCALARS[cpu_name] = scalar_parameters
+    CODELETS_SCALARS[cuda_name] = scalar_parameters
+
+    # declare structure carrying scalar parameters
+    struct_params_name = Symbol("params_", rand_string())
+    structure_decl_str = "mutable struct " * "$struct_params_name\n"
+    for p in scalar_parameters
+        structure_decl_str *= "$(p[1])::$(p[2])\n"
+    end
+    structure_decl_str *= "end"
+    eval(Meta.parse(structure_decl_str))
+
+    # add structure type to dictionnary
+    add_to_dict_str = "starpu_type_traduction_dict[$struct_params_name] = \"struct $struct_params_name\""
+    eval(Meta.parse(add_to_dict_str))
+
+    # save structure name
+    CODELETS_PARAMS_STRUCT[cpu_name] = struct_params_name
+    CODELETS_PARAMS_STRUCT[cuda_name] = struct_params_name
 end

+ 0 - 2
julia/black_scholes/black_scholes.jl

@@ -115,8 +115,6 @@ using StarPU
     return 0
 end
 
-
-@debugprint "starpu_init"
 starpu_init()
 
 function black_scholes_starpu(data ::Matrix{Float64}, res ::Matrix{Float64}, nslices ::Int64)

+ 58 - 0
julia/mandelbrot/Makefile

@@ -0,0 +1,58 @@
+CC=gcc
+NVCC=nvcc
+ENABLE_CUDA=no
+LD=$(CC)
+
+ifeq ($(ENABLE_CUDA),yes)
+        LD := ${NVCC}
+endif
+
+CFLAGS = -O3 -g $(shell pkg-config --cflags starpu-1.3)
+CPU_CFLAGS = ${CFLAGS} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
+CUDA_CFLAGS = ${CFLAGS}
+LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
+
+EXTERNLIB=extern_tasks.so
+GENERATEDLIB=generated_tasks.so
+
+C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
+ifneq ($(ENABLE_CUDA),yes)
+	CUDA_OBJECTS:=
+endif
+
+LIBPATH=${PWD}/../StarPU.jl/lib
+
+all: ${EXTERNLIB}
+
+mandelbrot: mandelbrot.c cpu_mandelbrot.o #gpu_mandelbrot.o
+	$(CC) $(CPU_CFLAGS) $^ -o $@ $(LDFLAGS)
+
+%.o: %.c
+	$(CC) -c -fPIC $(CPU_CFLAGS) $^ -o $@
+
+%.o: %.cu
+	$(NVCC) -dc $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+
+${EXTERNLIB}: cpu_mandelbrot.c
+	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
+
+${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
+	$(LD) -shared $(LDFLAGS) $^ -o $@
+
+.PHONY: clean
+
+clean:
+	rm -f mandelbrot *.so *.o genc_*.c gencuda_*.cu *.dat
+
+# Performance Tests
+cstarpu.dat: mandelbrot
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./mandelbrot -0.800671 -0.158392 32 32 4096 4 > $@
+julia_generatedc.dat:
+	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mandelbrot.jl $@
+julia_native.dat:
+	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mandelbrot_native.jl $@
+julia_calllib.dat: ${EXTERNLIB}
+	LD_LIBRARY_PATH+=${LIBPATH} JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mandelbrot.jl julia_calllib.dat
+
+test: cstarpu.dat julia_generatedc.dat julia_native.dat julia_calllib.dat

+ 28 - 24
julia/mandelbrot/cpu_mandelbrot.c

@@ -4,46 +4,50 @@
 
 void cpu_mandelbrot(void *descr[], void *cl_arg)
 {
-        long long int *pixels;
-	float *params;
+        long long *pixels;
+        float *params;
 
         pixels = (long long int *)STARPU_MATRIX_GET_PTR(descr[0]);
-	params = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
-
-        int width = STARPU_MATRIX_GET_NX(descr[0]);
-        int height = STARPU_MATRIX_GET_NY(descr[0]);
-        
-        int ldP = STARPU_MATRIX_GET_LD(descr[0]);
+        params = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
 
+        long long width = STARPU_MATRIX_GET_NY(descr[0]);
+        long long height = STARPU_MATRIX_GET_NX(descr[0]);
+        double zoom = width * 0.25296875;
+        double iz = 1. / zoom;
+        float diverge = 4.0;
+        float max_iterations = (width/2) * 0.049715909 * log10(zoom);
+        float imi = 1. / max_iterations;
         float centerr = params[0];
         float centeri = params[1];
         float offset = params[2];
         float dim = params[3];
-        float zoom = width * 0.25296875;
-        float diverge = 4.0;
-        int max_iter = (width/2) * 0.049715909 * log10(zoom);
+        double cr = 0;
+        double zr = 0;
+        double ci = 0;
+        double zi = 0;
+        long long n = 0;
+        double tmp = 0;
+        int ldP = STARPU_MATRIX_GET_LD(descr[0]);
 
-        int x,y,n;
+        long long x,y;
 
         for (y = 0; y < height; y++){
                 for (x = 0; x < width; x++){
-                        float cr = centerr + (x - (dim/2))/zoom;
-                        float ci = centeri + (y+offset - (dim/2))/zoom;
-                        float zr = cr;
-                        float zi = ci;
-                        
-                        for (n = 0; n <= max_iter; n++) {
+                        cr = centerr + (x - (dim/2)) * iz;
+			zr = cr;
+                        ci = centeri + (y+offset - (dim/2)) * iz;
+                        zi = ci;
+
+                        for (n = 0; n <= max_iterations; n++) {
 				if (zr*zr + zi*zi>diverge) break;
-                                float tmp = zr*zr - zi*zi + cr;
+                                tmp = zr*zr - zi*zi + cr;
                                 zi = 2*zr*zi + ci;
                                 zr = tmp;
                         }
-			int color;
-			if (n<max_iter)
-				color = round(15.*n/max_iter);
+			if (n<max_iterations)
+				pixels[y +x*ldP] = round(15.*n*imi);
 			else
-				color = 0;
-			pixels[x*ldP + y] = color;
+				pixels[y +x*ldP] = 0;
 		}
 	}
 }

+ 0 - 35
julia/mandelbrot/makefile

@@ -1,35 +0,0 @@
-# GCC compiler
-CC=gcc-9
-CFLAGS += -O3 -mavx -mfma -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
-
-LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
-EXTERNLIB=extern_tasks.dylib
-GENERATEDLIB=generated_tasks.dylib
-OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
-LIBPATH=${PWD}/../StarPU.jl/lib
-
-all: ${EXTERNLIB} 
-
-mult: mult.c cpu_mult.o #gpu_mult.o 
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)	
-
-gpu_mult.o: gpu_mult.cu
-	nvcc -c $(CFLAGS) $^ -o $@
-
-%.o: %.c
-	$(CC) -c $(CFLAGS) $^ -o $@
-
-${EXTERNLIB}: cpu_mandelbrot.o
-	$(CC) -shared -fPIC $(LDFLAGS) $^ -o $@  
-
-gpu_mult.so: gpu_mult.o
-	nvcc $(CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
-
-${GENERATEDLIB}: ${OBJECTS}
-	$(CC) -shared -fPIC $(LDFLAGS) $^ -o $@
-
-clean:
-	rm *.so *.o *.dylib c_*.genc gencuda_*.cu *.dat
-
-
-

+ 81 - 178
julia/mandelbrot/mandelbrot.c

@@ -16,43 +16,33 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <starpu.h>
-#include "../display.h"
 
 void cpu_mandelbrot(void **, void *);
 void gpu_mandelbrot(void **, void *);
 
-struct Params
+static struct starpu_perfmodel model =
 {
-	float cr;
-	float ci;
-	unsigned taskx;
-	unsigned tasky;
-	unsigned width;
-	unsigned height;
+		.type = STARPU_HISTORY_BASED,
+		.symbol = "history_perf"
 };
 
-
-
-struct starpu_codelet cl =
+static struct starpu_codelet cl =
 {
 	.cpu_funcs = {cpu_mandelbrot},
-	.cuda_funcs = {gpu_mandelbrot},
-	.nbuffers = 1,
-	.modes = {STARPU_RW}
+	//.cuda_funcs = {gpu_mandelbrot},
+	.nbuffers = 2,
+	.modes = {STARPU_W, STARPU_R},
+	.model = &model
 };
 
 
-void mandelbrot_with_starpu(int *pixels, float cr, float ci, unsigned width, unsigned height, unsigned nslicesx, unsigned nslicesy)
+void mandelbrot_with_starpu(long long *pixels, float *params, long long dim, long long nslicesx)
 {
-	starpu_data_handle_t p_handle;
-
-	starpu_matrix_data_register(&p_handle, STARPU_MAIN_RAM, (uintptr_t)pixels, width, width, height, sizeof(int));
+	starpu_data_handle_t pixels_handle;
+	starpu_data_handle_t params_handle;
 
-	struct starpu_data_filter vert =
-	{
-		.filter_func = starpu_matrix_filter_vertical_block,
-		.nchildren = nslicesy
-	};
+	starpu_matrix_data_register(&pixels_handle, STARPU_MAIN_RAM, (uintptr_t)pixels, dim, dim, dim, sizeof(long long));
+	starpu_matrix_data_register(&params_handle, STARPU_MAIN_RAM, (uintptr_t)params, 4*nslicesx, 4*nslicesx, 1, sizeof(float));
 
 	struct starpu_data_filter horiz =
 	{
@@ -60,179 +50,100 @@ void mandelbrot_with_starpu(int *pixels, float cr, float ci, unsigned width, uns
 		.nchildren = nslicesx
 	};
 
-	starpu_data_map_filters(p_handle, 2, &vert, &horiz);
+	starpu_data_partition(pixels_handle, &horiz);
+	starpu_data_partition(params_handle, &horiz);
 
-	unsigned taskx, tasky;
-
-	struct Params *params = malloc(nslicesx*nslicesy*sizeof(struct Params));
+	long long taskx;
 
 	for (taskx = 0; taskx < nslicesx; taskx++){
-		for (tasky = 0; tasky < nslicesy; tasky++){
-			struct starpu_task *task = starpu_task_create();
-			
-			task->cl = &cl;
-			task->handles[0] = starpu_data_get_sub_data(p_handle, 2, tasky, taskx);
-			struct Params param = {cr, ci, taskx, tasky, width, height};
-
-			params[taskx + tasky*nslicesx] = param;
-
-			task->cl_arg = (params + taskx + tasky * nslicesx);
-			task->cl_arg_size = sizeof(struct Params);
-			
-			starpu_task_submit(task);
-		}
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &cl;
+		task->handles[0] = starpu_data_get_child(pixels_handle, taskx);
+		task->handles[1] = starpu_data_get_child(params_handle, taskx);
+		if (starpu_task_submit(task)!=0) fprintf(stderr,"submit task error\n");
 	}
-	starpu_task_wait_for_all();
 
-	starpu_data_unpartition(p_handle, STARPU_MAIN_RAM);
+	starpu_task_wait_for_all();
 
-	starpu_data_unregister(p_handle);
+	starpu_data_unpartition(pixels_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(params_handle, STARPU_MAIN_RAM);
 
-	free(params);
+	starpu_data_unregister(pixels_handle);
+	starpu_data_unregister(params_handle);
 }
 
-void init_zero(int * pixels, unsigned width, unsigned height)
+void pixels2img(long long *pixels, long long width, long long height, const char *filename)
 {
-	unsigned i,j;
-	for (i = 0; i < height; i++){
-		for (j = 0; j < width; j++){
-			pixels[j + i*width] = 0;
-		}
-	}
-}
+  FILE *fp = fopen(filename, "w");
+  if (!fp)
+    return;
 
-void sort(double *arr, unsigned nbr_tests)
-{
-	unsigned j;
-	
-	int is_sort = 0;
-	
-	while (!is_sort){
-
-		is_sort = 1;
-		
-		for (j = 0; j < nbr_tests - 1; j++){
-			if (arr[j] > arr[j+1]){
-				is_sort = 0;
-				double tmp = arr[j];
-				arr[j] = arr[j+1];
-				arr[j+1] = tmp;
-			}
-		}
-	}
-}
-double median_time(float cr, float ci, unsigned width, unsigned height, unsigned nslicesx, unsigned nslicesy, unsigned nbr_tests)
-{
-	int *Pixels = malloc(width*height*sizeof(int));
-	
-	unsigned i;
+  int MAPPING[16][3] = {{66,30,15},{25,7,26},{9,1,47},{4,4,73},{0,7,100},{12,44,138},{24,82,177},{57,125,209},{134,181,229},{211,236,248},{241,233,191},{248,201,95},{255,170,0},{204,128,0},{153,87,0},{106,52,3}};
 
-	double exec_times[nbr_tests];
+  fprintf(fp, "P3\n%lld %lld\n255\n", width, height);
+  long long i, j;
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      fprintf(fp, "%d %d %d ", MAPPING[pixels[j*width+i]][0], MAPPING[pixels[j*width+i]][1], MAPPING[pixels[j*width+i]][2]);
+    }
+  }
 
-	double start, stop, exec_t;
-	for (i = 0; i < nbr_tests; i++){
-		init_zero(Pixels, width, height);
-		
-		start = starpu_timing_now(); // starpu_timing_now() gives the time in microseconds.
-		mandelbrot_with_starpu(Pixels, cr, ci, width, height, nslicesx, nslicesy);
-		stop = starpu_timing_now();
-		
-		exec_t = (stop-start)/1.e6;
-		exec_times[i] = exec_t;
-	}
-	char filename[30];
-	sprintf(filename, "PPM/mandelbrot%d.ppm", width);
-	printf("%s\n", filename);
-
-	mandelbrot_graph(filename, Pixels, width, height);
-
-	free(Pixels);
-
-	sort(exec_times, nbr_tests);
-
-	return exec_times[nbr_tests/2];	
+  fclose(fp);
 }
 
-void fluctuation_time(float cr, float ci, unsigned width, unsigned height, unsigned nslicesx, unsigned nslicesy, unsigned nbr_tests, double *exec_times)
+double min_times(double cr, double ci, long long dim, long long nslices)
 {
-	int *Pixels = malloc(width*height*sizeof(int));
-	
-	unsigned i;
+	long long *pixels = calloc(dim*dim, sizeof(long long));
+	float *params = calloc(4*nslices, sizeof(float));
+
+	double t_min = 0;
+	long long i;
+
+	for (i=0; i<nslices; i++) {
+		params[4*i+0] = cr;
+		params[4*i+1] = ci;
+		params[4*i+2] = i*dim/nslices;
+		params[4*i+3] = dim;
+	}
 
 	double start, stop, exec_t;
-	for (i = 0; i < nbr_tests; i++){
-		init_zero(Pixels, width, height);
-		
+	for (i = 0; i < 10; i++){
 		start = starpu_timing_now(); // starpu_timing_now() gives the time in microseconds.
-		mandelbrot_with_starpu(Pixels, cr, ci, width, height, nslicesx, nslicesy);
+		mandelbrot_with_starpu(pixels, params, dim, nslices);
 		stop = starpu_timing_now();
-		
-		exec_t = (stop-start)/1.e6;
-		exec_times[i] = exec_t;
-
-		/* char filename[33]; */
-		/* sprintf(filename, "../PPM/mandelbrot%d.ppm", i + 1); */
-		/* printf("%s\n", filename); */
-		/* mandelbrot_graph(filename, Pixels, width, height); */
+		exec_t = (stop-start)*1.e3;
+		if (t_min==0 || t_min>exec_t)
+		  t_min = exec_t;
 	}
 
+	char filename[64];
+	snprintf(filename, 64, "out%lld.ppm", dim);
+	pixels2img(pixels,dim,dim,filename);
 
-	free(Pixels);
-
-
+	free(pixels);
+	free(params);
 
-	
+	return t_min;
 }
 
-
-void display_times(float cr, float ci, unsigned start_dim, unsigned step_dim, unsigned stop_dim, unsigned nslices, unsigned nbr_tests)
+void display_times(double cr, double ci, long long start_dim, long long step_dim, long long stop_dim, long long nslices)
 {
-	
-	unsigned dim;
-
-	FILE *myfile;
-	myfile = fopen("DAT/mandelbrot_c_struct_times.dat", "w");
-
-	for (dim = start_dim; dim <= stop_dim; dim += step_dim){
-		printf("Dimension: %u...\n", dim);
-		double t = median_time(cr, ci, dim, dim, nslices, nslices, nbr_tests);
-		
-		printf("w = %u ; h = %u ; t = %f\n", dim, dim, t);
-		
-		fprintf(myfile, "%f\n", t);
-		}
-	
-	fclose(myfile);
-}
 
-void display_fluctuations(float cr, float ci, unsigned start_dim, unsigned step_dim, unsigned stop_dim, unsigned nslices, unsigned nbr_tests)
-{
-	
-	unsigned dim;
-
-	FILE *myfile;
-	myfile = fopen("DAT/mandelbrot_c_fluctuation.dat", "w");
-
-	double *exec_times = malloc(nbr_tests * sizeof(double));
-	fluctuation_time(cr, ci, start_dim, start_dim, nslices, nslices, nbr_tests, exec_times);
-		
-	/* printf("w = %u ; h = %u ; t = %f\n", dim, dim, t); */
-	unsigned i;
-	for (i = 0; i < nbr_tests; i++){
-		printf("test %u: %f seconds\n", i, exec_times[i]);
-		fprintf(myfile, "%u %f\n", i, exec_times[i]);
+	long long dim;
+
+	for (dim = start_dim; dim <= stop_dim; dim += step_dim) {
+		printf("Dimension: %lld...\n", dim);
+		double res = min_times(cr, ci, dim, nslices);
+		res = res / dim / dim; // time per pixel
+		printf("%lld %lf\n", dim, res);
 	}
-	
-	fclose(myfile);
-	free(exec_times);
 }
 
-
 int main(int argc, char **argv)
 {
-
-	if (argc != 8){
-		printf("Usage: %s cr ci start_dim step_dim stop_dim nslices(must divide dims) nbr_tests\n", argv[0]);
+	if (argc != 7){
+		printf("Usage: %s cr ci start_dim step_dim stop_dim nslices(must divide dims)\n", argv[0]);
 		return 1;
 	}
 	if (starpu_init(NULL) != EXIT_SUCCESS){
@@ -240,24 +151,16 @@ int main(int argc, char **argv)
 		return 77;
 	}
 
+	double cr = (float) atof(argv[1]);
+	double ci = (float) atof(argv[2]);
+	long long start_dim = atoll(argv[3]);
+	long long step_dim = atoll(argv[4]);
+	long long stop_dim = atoll(argv[5]);
+	long long nslices = atoll(argv[6]);
 
-	
-	float cr = (float) atof(argv[1]);
-	float ci = (float) atof(argv[2]);
-	unsigned start_dim = (unsigned) atoi(argv[3]);
-	unsigned step_dim = (unsigned) atoi(argv[4]);	
-	unsigned stop_dim = (unsigned) atoi(argv[5]);
-	unsigned nslices = (unsigned) atoi(argv[6]);
-	unsigned nbr_tests = (unsigned) atoi(argv[7]);
-
-	display_times(cr, ci, start_dim, step_dim, stop_dim, nslices, nbr_tests);
-	
-	
-	/* display_fluctuations(cr, ci, start_dim, step_dim, stop_dim, nslices, nbr_tests); */
-
+	display_times(cr, ci, start_dim, step_dim, stop_dim, nslices);
 
 	starpu_shutdown();
 
-
 	return 0;
 }

+ 12 - 22
julia/mandelbrot/mandelbrot.jl

@@ -3,7 +3,7 @@ using StarPU
 using LinearAlgebra
 
 @target STARPU_CPU+STARPU_CUDA
-@codelet function mandelbrot(pixels ::Matrix{Int64}, params ::Matrix{Float32} ) :: Float32
+@codelet function mandelbrot(pixels ::Matrix{Int64}, centerr ::Float64, centeri ::Float64, offset ::Int64, dim ::Int64 ) :: Nothing
     height :: Int64 = height(pixels)
     width :: Int64 = width(pixels)
     zoom :: Float64 = width * 0.25296875
@@ -11,10 +11,6 @@ using LinearAlgebra
     diverge :: Float32 = 4.0
     max_iterations :: Float32 = ((width/2) * 0.049715909 * log10(zoom));
     imi :: Float32 = 1. / max_iterations
-    centerr :: Float32 = params[1,1]
-    centeri :: Float32 = params[2,1]
-    offset :: Float32 = params[3,1]
-    dim :: Float32 = params[4,1]
     cr :: Float64 = 0.
     zr :: Float64 = 0.
     ci :: Float64 = 0.
@@ -27,7 +23,10 @@ using LinearAlgebra
             zr = cr
             ci = centeri + (y-1+offset - (dim / 2)) * iz
             zi = ci
-            for n = 0:max_iterations
+            max_it :: Float64 = max_iterations
+            n = 0
+            for i = 0:max_it
+                n = i
                 if (zr*zr + zi*zi > diverge)
                     break
                 end
@@ -43,21 +42,20 @@ using LinearAlgebra
             end
         end
     end
-    return 0. :: Float32
+
+    return
 end
 
-@debugprint "starpu_init"
 starpu_init()
 
-function mandelbrot_with_starpu(A ::Matrix{Int64}, params ::Matrix{Float32}, nslicesx ::Int64)
+function mandelbrot_with_starpu(A ::Matrix{Int64}, cr ::Float64, ci ::Float64, dim ::Int64, nslicesx ::Int64)
     horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesx)
     @starpu_block let
-	hA, hP = starpu_data_register(A,params)
+	hA = starpu_data_register(A)
 	starpu_data_partition(hA,horiz)
-        starpu_data_partition(hP,horiz)
-        
+
 	@starpu_sync_tasks for taskx in (1 : nslicesx)
-                @starpu_async_cl mandelbrot(hA[taskx], hP[taskx]) [STARPU_W, STARPU_R]
+                @starpu_async_cl mandelbrot(hA[taskx]) [STARPU_W] [cr, ci, (taskx-1)*dim/nslicesx, dim]
 	end
     end
 end
@@ -79,16 +77,9 @@ function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
     tmin=0;
     
     pixels ::Matrix{Int64} = zeros(dim, dim)
-    params :: Matrix{Float32} = zeros(4*nslices,1)
-    for i=0:(nslices-1)
-        params[4*i+1,1] = cr
-        params[4*i+2,1] = ci
-        params[4*i+3,1] = i*dim/nslices
-        params[4*i+4,1] = dim
-    end
     for i = 1:10
         t = time_ns();
-        mandelbrot_with_starpu(pixels, params, nslices)
+        mandelbrot_with_starpu(pixels, cr, ci, dim, nslices)
         t = time_ns()-t
         if (tmin==0 || tmin>t)
             tmin=t
@@ -109,6 +100,5 @@ end
 
 display_time(-0.800671,-0.158392,32,32,4096,4)
 
-@debugprint "starpu_shutdown"
 starpu_shutdown()
 

+ 96 - 0
julia/mandelbrot/mandelbrot_native.jl

@@ -0,0 +1,96 @@
+using LinearAlgebra
+
+function mandelbrot(pixels, centerr ::Float64, centeri ::Float64, offset ::Int64, dim ::Int64) :: Nothing
+    height :: Int64, width :: Int64 = size(pixels)
+    zoom :: Float64 = width * 0.25296875
+    iz :: Float64 = 1. / zoom
+    diverge :: Float32 = 4.0
+    max_iterations :: Float32 = ((width/2) * 0.049715909 * log10(zoom));
+    imi :: Float64 = 1. / max_iterations
+    cr :: Float64 = 0.
+    zr :: Float64 = 0.
+    ci :: Float64 = 0.
+    zi :: Float64 = 0.
+    n :: Int64 = 0
+    tmp :: Float64 = 0.
+    for y = 1:height
+        for x = 1:width
+            cr = centerr + (x-1 - (dim / 2)) * iz
+            zr = cr
+            ci = centeri + (y-1+offset - (dim / 2)) * iz
+            zi = ci
+            n = 0
+            for i = 0:max_iterations
+                n = i
+                if (zr*zr + zi*zi > diverge)
+                    break
+                end
+                tmp = zr*zr - zi*zi + cr
+                zi = 2*zr*zi + ci
+                zr = tmp
+            end
+
+            if (n < max_iterations)
+                pixels[y,x] = round(15 * n * imi)
+            else
+                pixels[y,x] = 0
+            end
+        end
+    end
+
+    return
+end
+
+function mandelbrot_without_starpu(A ::Matrix{Int64}, cr ::Float64, ci ::Float64, dim ::Int64, nslicesx ::Int64)
+    width,height = size(A)
+    step = height / nslicesx
+
+    for taskx in (1 : nslicesx)
+        start_id = floor(Int64, (taskx-1)*step+1)
+        end_id = floor(Int64, (taskx-1)*step+step)
+        a = view(A, start_id:end_id, :)
+
+        offset ::Int64 = (taskx-1)*dim/nslicesx
+        mandelbrot(a, cr, ci, offset, dim)
+    end
+end
+
+function pixels2img(pixels ::Matrix{Int64}, width ::Int64, height ::Int64, filename ::String)
+    MAPPING = [[66,30,15],[25,7,26],[9,1,47],[4,4,73],[0,7,100],[12,44,138],[24,82,177],[57,125,209],[134,181,229],[211,236,248],[241,233,191],[248,201,95],[255,170,0],[204,128,0],[153,87,0],[106,52,3]]
+    open(filename, "w") do f
+        write(f, "P3\n$width $height\n255\n")
+        for i = 1:height
+            for j = 1:width
+                write(f,"$(MAPPING[1+pixels[i,j]][1]) $(MAPPING[1+pixels[i,j]][2]) $(MAPPING[1+pixels[i,j]][3]) ")
+            end
+            write(f, "\n")
+        end
+    end
+end
+
+function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
+    tmin=0;
+
+    pixels ::Matrix{Int64} = zeros(dim, dim)
+    for i = 1:10
+        t = time_ns();
+        mandelbrot_without_starpu(pixels, cr, ci, dim, nslices)
+        t = time_ns()-t
+        if (tmin==0 || tmin>t)
+            tmin=t
+        end
+    end
+    pixels2img(pixels,dim,dim,"out$(dim).ppm")
+    return tmin
+end
+
+function display_time(cr ::Float64, ci ::Float64, start_dim ::Int64, step_dim ::Int64, stop_dim ::Int64, nslices ::Int64)
+    for dim in (start_dim : step_dim : stop_dim)
+        res = min_times(cr, ci, dim, nslices)
+        res=res/dim/dim; # time per pixel
+        println("$(dim) $(res)")
+    end
+end
+
+
+display_time(-0.800671,-0.158392,32,32,4096,4)

+ 28 - 16
julia/mult/Makefile

@@ -6,40 +6,52 @@ STRIDE=72
 #CFLAGS=-restrict -unroll4 -ipo -falign-loops=256 -O3 -DSTRIDE=${STRIDE} -march=native $(shell pkg-config --cflags starpu-1.3)
 # GCC compiler
 CC=gcc
-CFLAGS += -O3 -DSTRIDE=${STRIDE} -mavx -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
+NVCC=nvcc
+ENABLE_CUDA=no
+LD=$(CC)
 
+ifeq ($(ENABLE_CUDA),yes)
+        LD := ${NVCC}
+endif
+
+CFLAGS = -O3 -g -DSTRIDE=${STRIDE} $(shell pkg-config --cflags starpu-1.3)
+CPU_CFLAGS = ${CFLAGS} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
+CUDA_CFLAGS = ${CFLAGS}
 LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
+
 EXTERNLIB=extern_tasks.so
 GENERATEDLIB=generated_tasks.so
-#OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
-OBJECTS=$(wildcard gen*.c)
+
+C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
+ifneq ($(ENABLE_CUDA),yes)
+	CUDA_OBJECTS:=
+endif
+
+
 LIBPATH=${PWD}/../StarPU.jl/lib
 
 all: ${EXTERNLIB}
 
 mult: mult.c cpu_mult.o #gpu_mult.o
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
-
-gpu_mult.o: gpu_mult.cu
-	nvcc -c $(CFLAGS) $^ -o $@
+	$(CC) $(CPU_CFLAGS) $^ -o $@ $(LDFLAGS)
 
 %.o: %.c
-	$(CC) -c $(CFLAGS) $^ -o $@
+	$(CC) -c -fPIC $(CPU_CFLAGS) $^ -o $@
+
+%.o: %.cu
+	$(NVCC) -dc $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
 
 ${EXTERNLIB}: cpu_mult.c
 	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
 
-gpu_mult.so: gpu_mult.o
-	nvcc $(CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
+	$(LD) -shared $(LDFLAGS) $^ -o $@
 
-cpu_mult_sa: cpu_mult_sa.o
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
-
-${GENERATEDLIB}: ${OBJECTS}
-	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
+.PHONY: clean
 
 clean:
-	rm -f mult *.so *.o c_*.genc gencuda_*.cu *.dat
+	rm -f mult *.so *.o genc_*.c gencuda_*.cu *.dat
 
 # Performance Tests
 cstarpu.dat: mult

+ 43 - 44
julia/mult/mult.jl

@@ -6,58 +6,57 @@ using LinearAlgebra
 const STRIDE = 72
 
 @target STARPU_CPU+STARPU_CUDA
-@codelet function matrix_mult(m1 :: Matrix{Float32}, m2 :: Matrix{Float32}, m3 :: Matrix{Float32}) :: Float32
+@codelet function matrix_mult(m1 :: Matrix{Float32}, m2 :: Matrix{Float32}, m3 :: Matrix{Float32}) :: Nothing
 
     width_m2 :: Int32 = width(m2)
     height_m1 :: Int32 = height(m1)
     width_m1 :: Int32 = width(m1)
     # Naive version
-    #@parallel for j in (1 : width_m2)
-    #    @parallel for i in (1 : height_m1)
-    #
-    #          sum :: Float32 = 0.
-
-    #          for k in (1 : width_m1)
-    #              sum = sum + m1[i, k] * m2[k, j]
-    #          end
+    @parallel for j in (1 : width_m2)
+       @parallel for i in (1 : height_m1)
     
-    #          m3[i, j] = sum
-    #      end
-    #  end
-    ##### Tiled and unrolled version 
-    for l in (1 : width_m2)
-        for m in (1 : height_m1)
-            m3[m,l] = 0
-        end
-    end
-    @parallel for i in (1 : STRIDE : height_m1)
-        for k in (1 : STRIDE : width_m1 )
-            for j in (1 : STRIDE : width_m2  )
-                for kk in (k : 4 : k+STRIDE-1)
-                    for jj in (j : 2 : j+STRIDE-1)
-                        alpha00 :: Float32 =m2[kk,jj]
-                        alpha01 :: Float32 =m2[kk,jj+1]
-                        alpha10 :: Float32 =m2[kk+1,jj]
-                        alpha11 :: Float32 =m2[kk+1,jj+1]
-                        alpha20 :: Float32 =m2[kk+2,jj]
-                        alpha21 :: Float32 =m2[kk+2,jj+1]
-                        alpha30 :: Float32 =m2[kk+3,jj]
-                        alpha31 :: Float32 =m2[kk+3,jj+1]
-                        for ii in (i : 1 : i+STRIDE-1) 
-                            m3[ii, jj] = m3[ii, jj] + m1[ii, kk] * alpha00 + m1[ii, kk+1] * alpha10 + m1[ii, kk+2] * alpha20 + m1[ii,kk+3]*alpha30
-                            m3[ii, jj+1] = m3[ii, jj+1] + m1[ii, kk] * alpha01 + m1[ii, kk+1] * alpha11 + m1[ii, kk+2]*alpha21 + m1[ii,kk+3]*alpha31 
-                        end
-                    end
-                end
-            end
-        end
-    end
+             sum :: Float32 = 0.
 
-    return 0. :: Float32
+             for k in (1 : width_m1)
+                 sum = sum + m1[i, k] * m2[k, j]
+             end
+    
+             m3[i, j] = sum
+         end
+     end
+    # ##### Tiled and unrolled version 
+    # for l in (1 : width_m2)
+    #     for m in (1 : height_m1)
+    #         m3[m,l] = 0
+    #     end
+    # end
+    # @parallel for i in (1 : STRIDE : height_m1)
+    #     for k in (1 : STRIDE : width_m1 )
+    #         for j in (1 : STRIDE : width_m2  )
+    #             for kk in (k : 4 : k+STRIDE-1)
+    #                 for jj in (j : 2 : j+STRIDE-1)
+    #                     alpha00 :: Float32 =m2[kk,jj]
+    #                     alpha01 :: Float32 =m2[kk,jj+1]
+    #                     alpha10 :: Float32 =m2[kk+1,jj]
+    #                     alpha11 :: Float32 =m2[kk+1,jj+1]
+    #                     alpha20 :: Float32 =m2[kk+2,jj]
+    #                     alpha21 :: Float32 =m2[kk+2,jj+1]
+    #                     alpha30 :: Float32 =m2[kk+3,jj]
+    #                     alpha31 :: Float32 =m2[kk+3,jj+1]
+    #                     for ii in (i : 1 : i+STRIDE-1) 
+    #                         m3[ii, jj] = m3[ii, jj] + m1[ii, kk] * alpha00 + m1[ii, kk+1] * alpha10 + m1[ii, kk+2] * alpha20 + m1[ii,kk+3]*alpha30
+    #                         m3[ii, jj+1] = m3[ii, jj+1] + m1[ii, kk] * alpha01 + m1[ii, kk+1] * alpha11 + m1[ii, kk+2]*alpha21 + m1[ii,kk+3]*alpha31 
+    #                     end
+    #                 end
+    #             end
+    #         end
+    #     end
+    # end
+
+    return
 end
 
 
-@debugprint "starpu_init"
 starpu_init()
 
 function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy)
@@ -77,7 +76,7 @@ function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: M
         )
         cl = StarpuCodelet(
             cpu_func = CPU_CODELETS["matrix_mult"],
-            #cuda_func = "matrix_mult",
+            # cuda_func = CUDA_CODELETS["matrix_mult"],
             #opencl_func="ocl_matrix_mult",
             modes = [STARPU_R, STARPU_R, STARPU_W],
             perfmodel = perfmodel
@@ -141,6 +140,6 @@ end
 io=open(ARGS[1],"w")
 compute_times(io,16*STRIDE,4*STRIDE,4096,2,2)
 close(io)
-@debugprint "starpu_shutdown"
+
 starpu_shutdown()
 

+ 0 - 5
julia/mult/mult_native.jl

@@ -5,9 +5,6 @@ using LinearAlgebra
 #shoud be the same as in the makefile
 const STRIDE = 72
 
-@debugprint "starpu_init"
-starpu_init()
-
 function multiply_without_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy)
     tmin = 0
     for i in (1 : 10 )
@@ -39,6 +36,4 @@ end
 io=open(ARGS[1],"w")
 compute_times(io,16*STRIDE,4*STRIDE,4096,2,2)
 close(io)
-@debugprint "starpu_shutdown"
-starpu_shutdown()
 

+ 58 - 0
julia/variable/Makefile

@@ -0,0 +1,58 @@
+CC=gcc
+NVCC=nvcc
+ENABLE_CUDA=no
+LD=$(CC)
+
+ifeq ($(ENABLE_CUDA),yes)
+        LD := ${NVCC}
+endif
+
+CFLAGS = -O3 -g $(shell pkg-config --cflags starpu-1.3)
+CPU_CFLAGS = ${CFLAGS} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
+CUDA_CFLAGS = ${CFLAGS}
+LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
+
+EXTERNLIB=extern_tasks.so
+GENERATEDLIB=generated_tasks.so
+
+C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
+ifneq ($(ENABLE_CUDA),yes)
+	CUDA_OBJECTS:=
+endif
+
+LIBPATH=${PWD}/../StarPU.jl/lib
+
+all: ${EXTERNLIB}
+
+variable: variable.c cpu_variable.o #gpu_variable.o
+	$(CC) $(CPU_CFLAGS) $^ -o $@ $(LDFLAGS)
+
+%.o: %.c
+	$(CC) -c -fPIC $(CPU_CFLAGS) $^ -o $@
+
+%.o: %.cu
+	$(NVCC) -dc $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+
+${EXTERNLIB}: cpu_variable.c
+	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
+
+${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
+	$(LD) -shared $(LDFLAGS) $^ -o $@
+
+.PHONY: clean
+
+clean:
+	rm -f variable *.so *.o genc_*.c gencuda_*.cu *.dat
+
+# Performance Tests
+cstarpu.dat: variable
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./variable -0.800671 -0.158392 32 32 4096 4 > $@
+julia_generatedc.dat:
+	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia variable.jl $@
+julia_native.dat:
+	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia variable_native.jl $@
+julia_calllib.dat: ${EXTERNLIB}
+	LD_LIBRARY_PATH+=${LIBPATH} JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia variable.jl julia_calllib.dat
+
+test: cstarpu.dat julia_generatedc.dat julia_native.dat julia_calllib.dat

+ 38 - 0
julia/variable/variable.jl

@@ -0,0 +1,38 @@
+import Libdl
+using StarPU
+
+@target STARPU_CPU
+@codelet function variable(val ::Ref{Float32}) :: Nothing
+    val[] = val[] + 1
+
+    return
+end
+
+starpu_init()
+
+function variable_with_starpu(val ::Ref{Float32}, niter)
+    @starpu_block let
+	hVal = starpu_data_register(val)
+
+	@starpu_sync_tasks for task in (1 : niter)
+                @starpu_async_cl variable(hVal) [STARPU_RW]
+	end
+    end
+end
+
+function display(niter)
+    foo = Ref(0.0f0)
+
+    variable_with_starpu(foo, niter)
+
+    println("variable -> ", foo[])
+    if foo[] == niter
+        println("result is correct")
+    else
+        println("result is incorrect")
+    end
+end
+
+display(10)
+
+starpu_shutdown()

+ 26 - 0
julia/variable/variable_native.jl

@@ -0,0 +1,26 @@
+function variable(val ::Ref{Float32}) :: Nothing
+    val[] = val[] + 1
+
+    return
+end
+
+function variable_without_starpu(val ::Ref{Float32}, niter)
+    for i = 1:niter
+        variable(val)
+    end
+end
+
+function display(niter)
+    foo = Ref(0.0f0)
+
+    variable_without_starpu(foo, niter)
+
+    println("variable -> ", foo[])
+    if foo[] == niter
+        println("result is correct")
+    else
+        println("result is incorrect")
+    end
+end
+
+display(10)

+ 58 - 0
julia/vector_scal/Makefile

@@ -0,0 +1,58 @@
+CC=gcc
+NVCC=nvcc
+ENABLE_CUDA=no
+LD=$(CC)
+
+ifeq ($(ENABLE_CUDA),yes)
+        LD := ${NVCC}
+endif
+
+CFLAGS = -O3 -g $(shell pkg-config --cflags starpu-1.3)
+CPU_CFLAGS = ${CFLAGS} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
+CUDA_CFLAGS = ${CFLAGS}
+LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
+
+EXTERNLIB=extern_tasks.so
+GENERATEDLIB=generated_tasks.so
+
+C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
+ifneq ($(ENABLE_CUDA),yes)
+	CUDA_OBJECTS:=
+endif
+
+LIBPATH=${PWD}/../StarPU.jl/lib
+
+all: ${EXTERNLIB}
+
+vector_scal: vector_scal.c cpu_vector_scal.o #gpu_vector_scal.o
+	$(CC) $(CPU_CFLAGS) $^ -o $@ $(LDFLAGS)
+
+%.o: %.c
+	$(CC) -c -fPIC $(CPU_CFLAGS) $^ -o $@
+
+%.o: %.cu
+	$(NVCC) -dc $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+
+${EXTERNLIB}: cpu_vector_scal.c
+	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
+
+${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
+	$(LD) -shared $(LDFLAGS) $^ -o $@
+
+.PHONY: clean
+
+clean:
+	rm -f vector_scal *.so *.o genc_*.c gencuda_*.cu *.dat
+
+# Performance Tests
+cstarpu.dat: vector_scal
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./vector_scal > $@
+julia_generatedc.dat:
+	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia vector_scal.jl $@
+julia_native.dat:
+	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia vector_scal_native.jl $@
+julia_calllib.dat: ${EXTERNLIB}
+	LD_LIBRARY_PATH+=${LIBPATH} JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia vector_scal.jl julia_calllib.dat
+
+test: cstarpu.dat julia_generatedc.dat julia_native.dat julia_calllib.dat

+ 42 - 0
julia/vector_scal/cpu_vector_scal.c

@@ -0,0 +1,42 @@
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+#include <starpu.h>
+#include <math.h>
+
+struct params {
+  int32_t m;
+  float k;
+  float l;
+};
+
+float cpu_vector_scal(void *buffers[], void *cl_arg)
+{
+  /* get scalar parameters from cl_arg */
+  struct params *scalars = (struct params *) cl_arg;
+  int m = scalars->m;
+  float k = scalars->k;
+  float l = scalars->l;
+
+  struct starpu_vector_interface *vector = (struct starpu_vector_interface *) buffers[0];
+
+  /* length of the vector */
+  unsigned n = STARPU_VECTOR_GET_NX(vector);
+
+  /* get a pointer to the local copy of the vector : note that we have to
+   * cast it in (float *) since a vector could contain any type of
+   * elements so that the .ptr field is actually a uintptr_t */
+  float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
+
+  /* scale the vector */
+  for (unsigned i = 0; i < n; i++)
+    val[i] = val[i] * k + l + m;
+
+  return 0.0;
+}
+
+char* CPU = "cpu_vector_scal";
+char* GPU = "gpu_vector_scal";
+extern char *starpu_find_function(char *name, char *device) {
+	if (!strcmp(device,"gpu")) return GPU;
+	return CPU;
+}

+ 75 - 0
julia/vector_scal/vector_scal.jl

@@ -0,0 +1,75 @@
+import Libdl
+using StarPU
+using LinearAlgebra
+
+@target STARPU_CPU+STARPU_CUDA
+@codelet function vector_scal(m::Int32, v :: Vector{Float32}, k :: Float32, l :: Float32) :: Float32
+
+    N :: Int32 = length(v)
+    # Naive version
+    @parallel for i in (1 : N)
+        v[i] = v[i] * k + l + m
+    end
+end
+
+
+starpu_init()
+
+function vector_scal_with_starpu(v :: Vector{Float32}, m :: Int32, k :: Float32, l :: Float32)
+    tmin=0
+
+    @starpu_block let
+        hV = starpu_data_register(v)
+        tmin=0
+        perfmodel = StarpuPerfmodel(
+            perf_type = STARPU_HISTORY_BASED,
+            symbol = "history_perf"
+        )
+        cl = StarpuCodelet(
+            cpu_func = CPU_CODELETS["vector_scal"],
+            # cuda_func = CUDA_CODELETS["vector_scal"],
+            #opencl_func="ocl_matrix_mult",
+            modes = [STARPU_RW],
+            perfmodel = perfmodel
+        )
+
+        for i in (1 : 1)
+            t=time_ns()
+            @starpu_sync_tasks begin
+                handles = [hV]
+                task = StarpuTask(cl = cl, handles = handles, cl_arg=[m, k, l])
+                starpu_task_submit(task)
+            end
+            # @starpu_sync_tasks for task in (1:1)
+            #     @starpu_async_cl vector_scal(hV, STARPU_RW, [m, k, l])
+            # end
+            t=time_ns()-t
+            if (tmin==0 || tmin>t)
+                tmin=t
+            end
+        end
+    end
+    return tmin
+end
+
+function compute_times(io,start_dim, step_dim, stop_dim)
+    for size in (start_dim : step_dim : stop_dim)
+        V = Array(rand(Cfloat, size))
+        m :: Int32 = 10
+        k :: Float32 = 2.
+        l :: Float32 = 3.
+        println("INPUT ", V[1:10])
+        mt =  vector_scal_with_starpu(V, m, k, l)
+        println("OUTPUT ", V[1:10])
+        println(io,"$size $mt")
+        println("$size $mt")
+    end
+end
+
+
+io=open(ARGS[1],"w")
+compute_times(io,1024,1024,4096)
+close(io)
+
+starpu_shutdown()
+

+ 3 - 2
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -115,6 +115,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 		}
 	}
 
+	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 	start = starpu_timing_now();
 
@@ -159,9 +160,9 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 		starpu_iteration_pop();
 	}
 
-	starpu_task_wait_for_all();
-
+	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
 	starpu_mpi_barrier(MPI_COMM_WORLD);
+
 	end = starpu_timing_now();
 
 	for (m = 0; m < nblocks; m++)

+ 2 - 0
mpi/examples/mpi_lu/pxlu.c

@@ -899,6 +899,8 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size, unsig
 		starpu_iteration_pop();
 	}
 
+	int wait_ret = starpu_mpi_wait_for_all(MPI_COMM_WORLD);
+	STARPU_ASSERT(wait_ret == MPI_SUCCESS);
 	int barrier_ret = starpu_mpi_barrier(MPI_COMM_WORLD);
 	STARPU_ASSERT(barrier_ret == MPI_SUCCESS);
 

+ 14 - 9
mpi/examples/mpi_lu/pxlu_implicit.c

@@ -57,7 +57,7 @@ static void create_task_11(unsigned k)
 static void create_task_12(unsigned k, unsigned j)
 {
 #ifdef STARPU_DEVEL
-#warning temporary fix 
+#warning temporary fix
 #endif
 	starpu_mpi_task_insert(MPI_COMM_WORLD,
 			       //&STARPU_PLU(cl12),
@@ -79,7 +79,7 @@ static void create_task_12(unsigned k, unsigned j)
 static void create_task_21(unsigned k, unsigned i)
 {
 #ifdef STARPU_DEVEL
-#warning temporary fix 
+#warning temporary fix
 #endif
 	starpu_mpi_task_insert(MPI_COMM_WORLD,
 			       //&STARPU_PLU(cl21),
@@ -114,13 +114,14 @@ static void create_task_22(unsigned k, unsigned i, unsigned j)
 }
 
 /*
- *	code to bootstrap the factorization 
+ *	code to bootstrap the factorization
  */
 
 double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size, unsigned _no_prio)
 {
 	double start;
 	double end;
+	int ret;
 
 	nblocks = _nblocks;
 	rank = _rank;
@@ -130,7 +131,10 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size, unsig
 	/* create all the DAG nodes */
 	unsigned i,j,k;
 
-	starpu_mpi_barrier(MPI_COMM_WORLD);
+	ret = starpu_mpi_wait_for_all(MPI_COMM_WORLD);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+	ret = starpu_mpi_barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
 
 	start = starpu_timing_now();
 
@@ -170,15 +174,16 @@ double STARPU_PLU(plu_main)(unsigned _nblocks, int _rank, int _world_size, unsig
 		starpu_iteration_pop();
 	}
 
-	starpu_task_wait_for_all();
-
-	starpu_mpi_barrier(MPI_COMM_WORLD);
+	ret = starpu_mpi_wait_for_all(MPI_COMM_WORLD);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
+	ret = starpu_mpi_barrier(MPI_COMM_WORLD);
+	STARPU_ASSERT(ret == MPI_SUCCESS);
 
 	end = starpu_timing_now();
 
 	double timing = end - start;
-	
+
 //	fprintf(stderr, "RANK %d -> took %f ms\n", rank, timing/1000);
-	
+
 	return timing;
 }

+ 1 - 1
mpi/examples/user_datatype/user_datatype.c

@@ -120,8 +120,8 @@ int main(int argc, char **argv)
 		starpu_mpi_isend_detached(handle0, 0, 20, MPI_COMM_WORLD, NULL, NULL);
 	}
 
-	starpu_mpi_barrier(MPI_COMM_WORLD);
 	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
+	starpu_mpi_barrier(MPI_COMM_WORLD);
 
 	starpu_mpi_datatype_unregister(handle0);
 	starpu_data_unregister(handle0);

+ 1 - 1
mpi/examples/user_datatype/user_datatype2.c

@@ -80,8 +80,8 @@ int main(int argc, char **argv)
 		starpu_mpi_isend_detached(handle0, 0, 20, MPI_COMM_WORLD, NULL, NULL);
 	}
 
-	starpu_mpi_barrier(MPI_COMM_WORLD);
 	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
+	starpu_mpi_barrier(MPI_COMM_WORLD);
 
 	starpu_mpi_datatype_unregister(handle0);
 	starpu_data_unregister(handle0);

+ 2 - 0
mpi/src/Makefile.am

@@ -68,6 +68,7 @@ noinst_HEADERS =					\
 	starpu_mpi_cache_stats.h			\
 	starpu_mpi_task_insert.h			\
 	starpu_mpi_init.h				\
+	mpi/starpu_mpi_mpi.h				\
 	mpi/starpu_mpi_early_data.h			\
 	mpi/starpu_mpi_early_request.h			\
 	mpi/starpu_mpi_sync_data.h			\
@@ -77,6 +78,7 @@ noinst_HEADERS =					\
 	mpi/starpu_mpi_mpi_backend.h			\
 	nmad/starpu_mpi_nmad_backend.h			\
 	nmad/starpu_mpi_nmad_unknown_datatype.h		\
+	nmad/starpu_mpi_nmad.h				\
 	load_balancer/policy/data_movements_interface.h	\
 	load_balancer/policy/load_data_interface.h	\
 	load_balancer/policy/load_balancer_policy.h

+ 31 - 27
mpi/src/mpi/starpu_mpi_mpi.c

@@ -96,7 +96,7 @@ starpu_pthread_queue_t _starpu_mpi_thread_dontsleep;
 /* Count requests posted by the application and not yet submitted to MPI */
 static starpu_pthread_mutex_t mutex_posted_requests;
 static starpu_pthread_mutex_t mutex_ready_requests;
-static int posted_requests = 0, ready_requests = 0, newer_requests, barrier_running = 0;
+static int posted_requests = 0, ready_requests = 0, newer_requests, mpi_wait_for_all_running = 0;
 
 #define _STARPU_MPI_INC_POSTED_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_posted_requests); posted_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_posted_requests); }
 #define _STARPU_MPI_INC_READY_REQUESTS(value) { STARPU_PTHREAD_MUTEX_LOCK(&mutex_ready_requests); ready_requests += value; STARPU_PTHREAD_MUTEX_UNLOCK(&mutex_ready_requests); }
@@ -761,16 +761,40 @@ static void _starpu_mpi_barrier_func(struct _starpu_mpi_req *barrier_req)
 int _starpu_mpi_barrier(MPI_Comm comm)
 {
 	struct _starpu_mpi_req *barrier_req;
-	int ret = posted_requests+ready_requests;
 
+	/* Initialize the request structure */
+	_starpu_mpi_request_init(&barrier_req);
+	barrier_req->prio = INT_MAX;
+	barrier_req->func = _starpu_mpi_barrier_func;
+	barrier_req->request_type = BARRIER_REQ;
+	barrier_req->node_tag.node.comm = comm;
+
+	_STARPU_MPI_INC_POSTED_REQUESTS(1);
+	_starpu_mpi_submit_ready_request(barrier_req);
+
+	/* We wait for the MPI request to finish */
+	STARPU_PTHREAD_MUTEX_LOCK(&barrier_req->backend->req_mutex);
+	while (!barrier_req->completed)
+		STARPU_PTHREAD_COND_WAIT(&barrier_req->backend->req_cond, &barrier_req->backend->req_mutex);
+	STARPU_PTHREAD_MUTEX_UNLOCK(&barrier_req->backend->req_mutex);
+
+	_starpu_mpi_request_destroy(barrier_req);
+	_STARPU_MPI_LOG_OUT();
+
+	return 0;
+}
+
+int _starpu_mpi_wait_for_all(MPI_Comm comm)
+{
+	(void) comm;
 	_STARPU_MPI_LOG_IN();
 
 	/* First wait for *both* all tasks and MPI requests to finish, in case
 	 * some tasks generate MPI requests, MPI requests generate tasks, etc.
 	 */
 	STARPU_PTHREAD_MUTEX_LOCK(&progress_mutex);
-	STARPU_MPI_ASSERT_MSG(!barrier_running, "Concurrent starpu_mpi_barrier is not implemented, even on different communicators");
-	barrier_running = 1;
+	STARPU_MPI_ASSERT_MSG(!mpi_wait_for_all_running, "Concurrent starpu_mpi_wait_for_all is not implemented, even on different communicators");
+	mpi_wait_for_all_running = 1;
 	do
 	{
 		while (posted_requests || ready_requests)
@@ -786,29 +810,9 @@ int _starpu_mpi_barrier(MPI_Comm comm)
 		 * triggered by tasks completed and triggered tasks between
 		 * wait_for_all finished and we take the lock */
 	} while (posted_requests || ready_requests || newer_requests);
-	barrier_running = 0;
+	mpi_wait_for_all_running = 0;
 	STARPU_PTHREAD_MUTEX_UNLOCK(&progress_mutex);
-
-	/* Initialize the request structure */
-	_starpu_mpi_request_init(&barrier_req);
-	barrier_req->prio = INT_MAX;
-	barrier_req->func = _starpu_mpi_barrier_func;
-	barrier_req->request_type = BARRIER_REQ;
-	barrier_req->node_tag.node.comm = comm;
-
-	_STARPU_MPI_INC_POSTED_REQUESTS(1);
-	_starpu_mpi_submit_ready_request(barrier_req);
-
-	/* We wait for the MPI request to finish */
-	STARPU_PTHREAD_MUTEX_LOCK(&barrier_req->backend->req_mutex);
-	while (!barrier_req->completed)
-		STARPU_PTHREAD_COND_WAIT(&barrier_req->backend->req_cond, &barrier_req->backend->req_mutex);
-	STARPU_PTHREAD_MUTEX_UNLOCK(&barrier_req->backend->req_mutex);
-
-	_starpu_mpi_request_destroy(barrier_req);
-	_STARPU_MPI_LOG_OUT();
-
-	return ret;
+	return 0;
 }
 
 /********************************************************/
@@ -1269,7 +1273,7 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 			_STARPU_MPI_DEBUG(3, "NO MORE REQUESTS TO HANDLE\n");
 			_STARPU_MPI_TRACE_SLEEP_BEGIN();
 
-			if (barrier_running)
+			if (mpi_wait_for_all_running)
 				/* Tell mpi_barrier */
 				STARPU_PTHREAD_COND_SIGNAL(&barrier_cond);
 			STARPU_PTHREAD_COND_WAIT(&progress_cond, &progress_mutex);

+ 53 - 0
mpi/src/mpi/starpu_mpi_mpi.h

@@ -0,0 +1,53 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_MPI_H__
+#define __STARPU_MPI_MPI_H__
+
+#include <starpu.h>
+#include <stdlib.h>
+#include <mpi.h>
+#include <common/config.h>
+#include <common/list.h>
+
+#ifdef STARPU_USE_MPI_MPI
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv);
+void _starpu_mpi_progress_shutdown(void **value);
+
+#ifdef STARPU_SIMGRID
+void _starpu_mpi_wait_for_initialization();
+#endif
+
+int _starpu_mpi_barrier(MPI_Comm comm);
+int _starpu_mpi_wait_for_all(MPI_Comm comm);
+int _starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status);
+int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status);
+
+void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req);
+void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* STARPU_USE_MPI_MPI */
+#endif /* __STARPU_MPI_MPI_H__ */

+ 16 - 1
mpi/src/mpi/starpu_mpi_mpi_backend.c

@@ -27,6 +27,7 @@
 #include <mpi/starpu_mpi_comm.h>
 #include <mpi/starpu_mpi_tag.h>
 #include <mpi/starpu_mpi_driver.h>
+#include <mpi/starpu_mpi_mpi.h>
 
 void _starpu_mpi_mpi_backend_init(struct starpu_conf *conf)
 {
@@ -109,7 +110,21 @@ struct _starpu_mpi_backend _mpi_backend =
 	._starpu_mpi_backend_request_destroy = _starpu_mpi_mpi_backend_request_destroy,
 	._starpu_mpi_backend_data_clear = _starpu_mpi_mpi_backend_data_clear,
 	._starpu_mpi_backend_data_register = _starpu_mpi_mpi_backend_data_register,
-	._starpu_mpi_backend_comm_register = _starpu_mpi_mpi_backend_comm_register
+	._starpu_mpi_backend_comm_register = _starpu_mpi_mpi_backend_comm_register,
+
+	._starpu_mpi_backend_progress_init = _starpu_mpi_progress_init,
+	._starpu_mpi_backend_progress_shutdown = _starpu_mpi_progress_shutdown,
+#ifdef STARPU_SIMGRID
+	._starpu_mpi_backend_wait_for_initialization = _starpu_mpi_wait_for_initialization,
+#endif
+
+	._starpu_mpi_backend_barrier = _starpu_mpi_barrier,
+	._starpu_mpi_backend_wait_for_all = _starpu_mpi_wait_for_all,
+	._starpu_mpi_backend_wait = _starpu_mpi_wait,
+	._starpu_mpi_backend_test = _starpu_mpi_test,
+
+	._starpu_mpi_backend_isend_size_func = _starpu_mpi_isend_size_func,
+	._starpu_mpi_backend_irecv_size_func = _starpu_mpi_irecv_size_func,
 };
 
 #endif /* STARPU_USE_MPI_MPI*/

+ 53 - 16
mpi/src/nmad/starpu_mpi_nmad.c

@@ -59,11 +59,15 @@ static starpu_pthread_cond_t progress_cond;
 static starpu_pthread_mutex_t progress_mutex;
 static volatile int running = 0;
 
-extern struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count);
+static starpu_pthread_cond_t mpi_wait_for_all_running_cond;
+static int mpi_wait_for_all_running = 0;
+static starpu_pthread_mutex_t mpi_wait_for_all_running_mutex;
 
-/* Count requests posted by the application and not yet submitted to MPI, i.e pushed into the new_requests list */
+extern struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handle, int source, int data_tag, MPI_Comm comm, unsigned detached, unsigned sync, void (*callback)(void *), void *arg, int sequential_consistency, int is_internal_req, starpu_ssize_t count);
 
-static volatile int pending_request = 0;
+/* Count running requests: this counter is incremented just before StarPU
+ * submits a MPI request, and decremented when a MPI request finishes. */
+static volatile int nb_pending_requests = 0;
 
 #define REQ_FINALIZED 0x1
 
@@ -80,7 +84,7 @@ static starpu_sem_t callback_sem;
 
 void _starpu_mpi_req_willpost(struct _starpu_mpi_req *req STARPU_ATTRIBUTE_UNUSED)
 {
-	STARPU_ATOMIC_ADD( &pending_request, 1);
+	STARPU_ATOMIC_ADD( &nb_pending_requests, 1);
 }
 
 /********************************************************/
@@ -269,16 +273,39 @@ int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 int _starpu_mpi_barrier(MPI_Comm comm)
 {
 	_STARPU_MPI_LOG_IN();
-	int ret;
-	//	STARPU_ASSERT_MSG(!barrier_running, "Concurrent starpu_mpi_barrier is not implemented, even on different communicators");
-	ret = MPI_Barrier(comm);
 
+	int ret = MPI_Barrier(comm);
 	STARPU_ASSERT_MSG(ret == MPI_SUCCESS, "MPI_Barrier returning %d", ret);
 
 	_STARPU_MPI_LOG_OUT();
 	return ret;
 }
 
+int _starpu_mpi_wait_for_all(MPI_Comm comm)
+{
+	(void) comm;
+	_STARPU_MPI_LOG_IN();
+
+	STARPU_PTHREAD_MUTEX_LOCK(&mpi_wait_for_all_running_mutex);
+	STARPU_MPI_ASSERT_MSG(!mpi_wait_for_all_running, "Concurrent starpu_mpi_wait_for_all is not implemented, even on different communicators");
+	mpi_wait_for_all_running = 1;
+	do
+	{
+		while (nb_pending_requests)
+			STARPU_PTHREAD_COND_WAIT(&mpi_wait_for_all_running_cond, &mpi_wait_for_all_running_mutex);
+		STARPU_PTHREAD_MUTEX_UNLOCK(&mpi_wait_for_all_running_mutex);
+
+		starpu_task_wait_for_all();
+
+		STARPU_PTHREAD_MUTEX_LOCK(&mpi_wait_for_all_running_mutex);
+	} while (nb_pending_requests);
+	mpi_wait_for_all_running = 0;
+	STARPU_PTHREAD_MUTEX_UNLOCK(&mpi_wait_for_all_running_mutex);
+
+	_STARPU_MPI_LOG_OUT();
+	return 0;
+}
+
 /********************************************************/
 /*                                                      */
 /*  Progression                                         */
@@ -353,9 +380,13 @@ void _starpu_mpi_handle_request_termination(struct _starpu_mpi_req *req,nm_sr_ev
 			req->completed = 1;
 			piom_cond_signal(&req->backend->req_cond, REQ_FINALIZED);
 		}
-		int pending_remaining = STARPU_ATOMIC_ADD(&pending_request, -1);
-		if (!running && !pending_remaining)
-			starpu_sem_post(&callback_sem);
+		int pending_remaining = STARPU_ATOMIC_ADD(&nb_pending_requests, -1);
+		if (!pending_remaining)
+		{
+			STARPU_PTHREAD_COND_BROADCAST(&mpi_wait_for_all_running_cond);
+			if (!running)
+				starpu_sem_post(&callback_sem);
+		}
 	}
 	_STARPU_MPI_LOG_OUT();
 }
@@ -476,24 +507,24 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 		struct callback_lfstack_cell_s* c = callback_lfstack_pop(&callback_stack);
 		int err=0;
 
-		if(running || pending_request>0)
+		if(running || nb_pending_requests>0)
 		{
 			/* shall we block ? */
 			err = starpu_sem_wait(&callback_sem);
-			//running pending_request can change while waiting
+			//running nb_pending_requests can change while waiting
 		}
 		if(c==NULL)
 		{
 			c = callback_lfstack_pop(&callback_stack);
 			if (c == NULL)
 			{
-				if(running && pending_request>0)
+				if(running && nb_pending_requests>0)
 				{
 					STARPU_ASSERT_MSG(c!=NULL, "Callback thread awakened without callback ready with error %d.",err);
 				}
 				else
 				{
-					if (pending_request==0)
+					if (nb_pending_requests==0)
 						break;
 				}
 				continue;
@@ -511,14 +542,14 @@ static void *_starpu_mpi_progress_thread_func(void *arg)
 			c->req->completed=1;
 			piom_cond_signal(&(c->req->backend->req_cond), REQ_FINALIZED);
 		}
-		STARPU_ATOMIC_ADD( &pending_request, -1);
+		STARPU_ATOMIC_ADD( &nb_pending_requests, -1);
 		/* we signal that the request is completed.*/
 
 		free(c);
 
 	}
 	STARPU_ASSERT_MSG(callback_lfstack_pop(&callback_stack)==NULL, "List of callback not empty.");
-	STARPU_ASSERT_MSG(pending_request==0, "Request still pending.");
+	STARPU_ASSERT_MSG(nb_pending_requests==0, "Request still pending.");
 
 	if (argc_argv->initialize_mpi)
 	{
@@ -580,6 +611,9 @@ int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv)
         STARPU_PTHREAD_MUTEX_INIT(&progress_mutex, NULL);
         STARPU_PTHREAD_COND_INIT(&progress_cond, NULL);
 
+        STARPU_PTHREAD_MUTEX_INIT(&mpi_wait_for_all_running_mutex, NULL);
+        STARPU_PTHREAD_COND_INIT(&mpi_wait_for_all_running_cond, NULL);
+
 	starpu_sem_init(&callback_sem, 0, 0);
 	running = 0;
 
@@ -669,6 +703,9 @@ void _starpu_mpi_progress_shutdown(void **value)
 
         STARPU_PTHREAD_MUTEX_DESTROY(&progress_mutex);
         STARPU_PTHREAD_COND_DESTROY(&progress_cond);
+
+        STARPU_PTHREAD_MUTEX_DESTROY(&mpi_wait_for_all_running_mutex);
+        STARPU_PTHREAD_COND_DESTROY(&mpi_wait_for_all_running_cond);
 }
 
 static int64_t _starpu_mpi_tag_max = INT64_MAX;

+ 53 - 0
mpi/src/nmad/starpu_mpi_nmad.h

@@ -0,0 +1,53 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2020       Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#ifndef __STARPU_MPI_NMAD_H__
+#define __STARPU_MPI_NMAD_H__
+
+#include <starpu.h>
+#include <stdlib.h>
+#include <mpi.h>
+#include <common/config.h>
+#include <common/list.h>
+
+#ifdef STARPU_USE_MPI_NMAD
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv);
+void _starpu_mpi_progress_shutdown(void **value);
+
+//#ifdef STARPU_SIMGRID
+//void _starpu_mpi_wait_for_initialization();
+//#endif
+
+int _starpu_mpi_barrier(MPI_Comm comm);
+int _starpu_mpi_wait_for_all(MPI_Comm comm);
+int _starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status);
+int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status);
+
+void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req);
+void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* STARPU_USE_MPI_NMAD */
+#endif /* __STARPU_MPI_NMAD_H__ */

+ 16 - 1
mpi/src/nmad/starpu_mpi_nmad_backend.c

@@ -17,6 +17,7 @@
 #include <stdlib.h>
 #include "starpu_mpi_nmad_backend.h"
 #include <starpu_mpi_private.h>
+#include "starpu_mpi_nmad.h"
 
 #ifdef STARPU_USE_MPI_NMAD
 
@@ -91,7 +92,21 @@ struct _starpu_mpi_backend _mpi_backend =
 	._starpu_mpi_backend_request_destroy = _starpu_mpi_nmad_backend_request_destroy,
 	._starpu_mpi_backend_data_clear = _starpu_mpi_nmad_backend_data_clear,
 	._starpu_mpi_backend_data_register = _starpu_mpi_nmad_backend_data_register,
-	._starpu_mpi_backend_comm_register = _starpu_mpi_nmad_backend_comm_register
+	._starpu_mpi_backend_comm_register = _starpu_mpi_nmad_backend_comm_register,
+
+	._starpu_mpi_backend_progress_init = _starpu_mpi_progress_init,
+	._starpu_mpi_backend_progress_shutdown = _starpu_mpi_progress_shutdown,
+//#ifdef STARPU_SIMGRID
+//	._starpu_mpi_backend_wait_for_initialization = _starpu_mpi_wait_for_initialization,
+//#endif
+
+	._starpu_mpi_backend_barrier = _starpu_mpi_barrier,
+	._starpu_mpi_backend_wait_for_all = _starpu_mpi_wait_for_all,
+	._starpu_mpi_backend_wait = _starpu_mpi_wait,
+	._starpu_mpi_backend_test = _starpu_mpi_test,
+
+	._starpu_mpi_backend_isend_size_func = _starpu_mpi_isend_size_func,
+	._starpu_mpi_backend_irecv_size_func = _starpu_mpi_irecv_size_func,
 };
 
 #endif /* STARPU_USE_MPI_NMAD*/

+ 6 - 8
mpi/src/starpu_mpi.c

@@ -70,7 +70,7 @@ static struct _starpu_mpi_req *_starpu_mpi_isend_common(starpu_data_handle_t dat
 	enum starpu_data_access_mode mode = STARPU_R;
 #endif
 
-	struct _starpu_mpi_req *req = _starpu_mpi_request_fill(data_handle, dest, data_tag, comm, detached, sync, prio, callback, arg, SEND_REQ, _starpu_mpi_isend_size_func, sequential_consistency, 0, 0);
+	struct _starpu_mpi_req *req = _starpu_mpi_request_fill(data_handle, dest, data_tag, comm, detached, sync, prio, callback, arg, SEND_REQ, _mpi_backend._starpu_mpi_backend_isend_size_func, sequential_consistency, 0, 0);
 	_starpu_mpi_req_willpost(req);
 
 	if (_starpu_mpi_use_coop_sends && detached == 1 && sync == 0 && callback == NULL)
@@ -183,7 +183,7 @@ struct _starpu_mpi_req *_starpu_mpi_irecv_common(starpu_data_handle_t data_handl
 		return NULL;
 	}
 
-	struct _starpu_mpi_req *req = _starpu_mpi_request_fill(data_handle, source, data_tag, comm, detached, sync, 0, callback, arg, RECV_REQ, _starpu_mpi_irecv_size_func, sequential_consistency, is_internal_req, count);
+	struct _starpu_mpi_req *req = _starpu_mpi_request_fill(data_handle, source, data_tag, comm, detached, sync, 0, callback, arg, RECV_REQ, _mpi_backend._starpu_mpi_backend_irecv_size_func, sequential_consistency, is_internal_req, count);
 	_starpu_mpi_req_willpost(req);
 	_starpu_mpi_isend_irecv_common(req, STARPU_W, sequential_consistency);
 	return req;
@@ -240,17 +240,17 @@ int starpu_mpi_recv(starpu_data_handle_t data_handle, int source, starpu_mpi_tag
 
 int starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status)
 {
-	return _starpu_mpi_wait(public_req, status);
+	return _mpi_backend._starpu_mpi_backend_wait(public_req, status);
 }
 
 int starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status)
 {
-	return _starpu_mpi_test(public_req, flag, status);
+	return _mpi_backend._starpu_mpi_backend_test(public_req, flag, status);
 }
 
 int starpu_mpi_barrier(MPI_Comm comm)
 {
-	return _starpu_mpi_barrier(comm);
+	return _mpi_backend._starpu_mpi_backend_barrier(comm);
 }
 
 void _starpu_mpi_data_clear(starpu_data_handle_t data_handle)
@@ -437,7 +437,5 @@ void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t data, int new_r
 
 int starpu_mpi_wait_for_all(MPI_Comm comm)
 {
-	starpu_task_wait_for_all();
-	starpu_mpi_barrier(comm);
-	return 0;
+	return _mpi_backend._starpu_mpi_backend_wait_for_all(comm);
 }

+ 29 - 3
mpi/src/starpu_mpi_init.c

@@ -92,6 +92,30 @@ void _starpu_mpi_do_initialize(struct _starpu_mpi_argc_argv *argc_argv)
 }
 
 static
+void _starpu_mpi_backend_check()
+{
+	STARPU_ASSERT(_mpi_backend._starpu_mpi_backend_init != NULL);
+	STARPU_ASSERT(_mpi_backend._starpu_mpi_backend_shutdown != NULL);
+	STARPU_ASSERT(_mpi_backend._starpu_mpi_backend_reserve_core != NULL);
+	STARPU_ASSERT(_mpi_backend._starpu_mpi_backend_request_init != NULL);
+	STARPU_ASSERT(_mpi_backend._starpu_mpi_backend_request_fill != NULL);
+	STARPU_ASSERT(_mpi_backend._starpu_mpi_backend_request_destroy != NULL);
+	STARPU_ASSERT(_mpi_backend._starpu_mpi_backend_data_clear != NULL);
+	STARPU_ASSERT(_mpi_backend._starpu_mpi_backend_data_register != NULL);
+	STARPU_ASSERT(_mpi_backend._starpu_mpi_backend_comm_register != NULL);
+	STARPU_ASSERT(_mpi_backend._starpu_mpi_backend_progress_init != NULL);
+	STARPU_ASSERT(_mpi_backend._starpu_mpi_backend_progress_shutdown != NULL);
+#ifdef STARPU_SIMGRID
+	STARPU_ASSERT(_mpi_backend._starpu_mpi_backend_wait_for_initialization != NULL);
+#endif
+	STARPU_ASSERT(_mpi_backend._starpu_mpi_backend_barrier != NULL);
+	STARPU_ASSERT(_mpi_backend._starpu_mpi_backend_wait_for_all != NULL);
+	STARPU_ASSERT(_mpi_backend._starpu_mpi_backend_wait != NULL);
+	STARPU_ASSERT(_mpi_backend._starpu_mpi_backend_test != NULL);
+	STARPU_ASSERT(_mpi_backend._starpu_mpi_backend_isend_size_func != NULL);
+	STARPU_ASSERT(_mpi_backend._starpu_mpi_backend_irecv_size_func != NULL);
+}
+static
 int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi, MPI_Comm comm)
 {
 	struct _starpu_mpi_argc_argv *argc_argv;
@@ -102,13 +126,15 @@ int _starpu_mpi_initialize(int *argc, char ***argv, int initialize_mpi, MPI_Comm
 	argc_argv->comm = comm;
 	_starpu_implicit_data_deps_write_hook(_starpu_mpi_data_flush);
 
+	_starpu_mpi_backend_check();
+
 #ifdef STARPU_SIMGRID
 	/* Call MPI_Init_thread as early as possible, to initialize simgrid
 	 * before working with mutexes etc. */
 	_starpu_mpi_do_initialize(argc_argv);
 #endif
 
-	return _starpu_mpi_progress_init(argc_argv);
+	return _mpi_backend._starpu_mpi_backend_progress_init(argc_argv);
 }
 
 #ifdef STARPU_SIMGRID
@@ -127,7 +153,7 @@ int starpu_mpi_init_comm(int *argc, char ***argv, int initialize_mpi, MPI_Comm c
 	(void)argv;
 	(void)initialize_mpi;
 	(void)comm;
-	_starpu_mpi_wait_for_initialization();
+	_mpi_backend._starpu_mpi_backend_wait_for_initialization();
 	return 0;
 #else
 	return _starpu_mpi_initialize(argc, argv, initialize_mpi, comm);
@@ -207,7 +233,7 @@ int starpu_mpi_shutdown(void)
 	starpu_mpi_comm_size(MPI_COMM_WORLD, &world_size);
 
 	/* kill the progression thread */
-	_starpu_mpi_progress_shutdown(&value);
+	_mpi_backend._starpu_mpi_backend_progress_shutdown(&value);
 
 #ifdef STARPU_USE_FXT
 	if (starpu_fxt_is_enabled())

+ 36 - 33
mpi/src/starpu_mpi_private.h

@@ -181,21 +181,21 @@ struct _starpu_mpi_node_tag
 };
 
 MULTILIST_CREATE_TYPE(_starpu_mpi_req, coop_sends)
-/* One bag of cooperative sends */
+/** One bag of cooperative sends */
 struct _starpu_mpi_coop_sends
 {
-	/* List of send requests */
+	/** List of send requests */
 	struct _starpu_mpi_req_multilist_coop_sends reqs;
 	struct _starpu_mpi_data *mpi_data;
 
-	/* Array of send requests, after sorting out */
+	/** Array of send requests, after sorting out */
 	struct _starpu_spinlock lock;
 	struct _starpu_mpi_req **reqs_array;
 	unsigned n;
 	unsigned redirects_sent;
 };
 
-/* Initialized in starpu_mpi_data_register_comm */
+/** Initialized in starpu_mpi_data_register_comm */
 struct _starpu_mpi_data
 {
 	int magic;
@@ -203,9 +203,11 @@ struct _starpu_mpi_data
 	int *cache_sent;
 	int cache_received;
 
-	/* Rendez-vous data for opportunistic cooperative sends */
-	struct _starpu_spinlock coop_lock; /* Needed to synchronize between submit thread and workers */
-	struct _starpu_mpi_coop_sends *coop_sends; /* Current cooperative send bag */
+	/** Rendez-vous data for opportunistic cooperative sends */
+	/** Needed to synchronize between submit thread and workers */
+	struct _starpu_spinlock coop_lock;
+	/** Current cooperative send bag */
+	struct _starpu_mpi_coop_sends *coop_sends;
 };
 
 struct _starpu_mpi_data *_starpu_mpi_data_get(starpu_data_handle_t data_handle);
@@ -213,12 +215,12 @@ struct _starpu_mpi_data *_starpu_mpi_data_get(starpu_data_handle_t data_handle);
 struct _starpu_mpi_req_backend;
 struct _starpu_mpi_req;
 LIST_TYPE(_starpu_mpi_req,
-	/* description of the data at StarPU level */
+	/** description of the data at StarPU level */
 	starpu_data_handle_t data_handle;
 
 	int prio;
 
-	/* description of the data to be sent/received */
+	/** description of the data to be sent/received */
 	MPI_Datatype datatype;
 	char *datatype_name;
 	void *ptr;
@@ -227,7 +229,7 @@ LIST_TYPE(_starpu_mpi_req,
 
 	struct _starpu_mpi_req_backend *backend;
 
-	/* who are we talking to ? */
+	/** who are we talking to ? */
 	struct _starpu_mpi_node_tag node_tag;
 	void (*func)(struct _starpu_mpi_req *);
 
@@ -238,7 +240,7 @@ LIST_TYPE(_starpu_mpi_req,
 	int *flag;
 	unsigned sync;
 
-	/* Amount of memory pre-reserved for the reception buffer */
+	/** Amount of memory pre-reserved for the reception buffer */
 	size_t reserved_size;
 
 	int ret;
@@ -249,13 +251,11 @@ LIST_TYPE(_starpu_mpi_req,
 	unsigned completed;
 	unsigned posted;
 
-	/* in the case of detached requests */
+	/** in the case of detached requests */
 	int detached;
 	void *callback_arg;
 	void (*callback)(void *);
 
-        /* in the case of user-defined datatypes, we need to send the size of the data */
-
 	int sequential_consistency;
 
 	long pre_sync_jobid;
@@ -271,21 +271,21 @@ PRIO_LIST_TYPE(_starpu_mpi_req, prio)
 
 MULTILIST_CREATE_INLINES(struct _starpu_mpi_req, _starpu_mpi_req, coop_sends)
 
-/* To be called before actually queueing a request, so the communication layer knows it has something to look at */
+/** To be called before actually queueing a request, so the communication layer knows it has something to look at */
 void _starpu_mpi_req_willpost(struct _starpu_mpi_req *req);
-/* To be called to actually submit the request */
+/** To be called to actually submit the request */
 void _starpu_mpi_submit_ready_request(void *arg);
-/* To be called when request is completed */
+/** To be called when request is completed */
 void _starpu_mpi_release_req_data(struct _starpu_mpi_req *req);
 
 #if 0
-/* Build a communication tree. Called before _starpu_mpi_coop_send is ever called. coop_sends->lock is held. */
+/** Build a communication tree. Called before _starpu_mpi_coop_send is ever called. coop_sends->lock is held. */
 void _starpu_mpi_coop_sends_build_tree(struct _starpu_mpi_coop_sends *coop_sends);
 #endif
-/* Try to merge with send request with other send requests */
+/** Try to merge with send request with other send requests */
 void _starpu_mpi_coop_send(starpu_data_handle_t data_handle, struct _starpu_mpi_req *req, enum starpu_data_access_mode mode, int sequential_consistency);
 
-/* Actually submit the coop_sends bag to MPI.
+/** Actually submit the coop_sends bag to MPI.
  * At least one of submit_control or submit_data is true.
  * _starpu_mpi_submit_coop_sends may be called either
  * - just once with both parameters being true,
@@ -306,11 +306,7 @@ struct _starpu_mpi_req * _starpu_mpi_request_fill(starpu_data_handle_t data_hand
 						  starpu_ssize_t count);
 
 void _starpu_mpi_request_destroy(struct _starpu_mpi_req *req);
-void _starpu_mpi_isend_size_func(struct _starpu_mpi_req *req);
-void _starpu_mpi_irecv_size_func(struct _starpu_mpi_req *req);
-int _starpu_mpi_wait(starpu_mpi_req *public_req, MPI_Status *status);
-int _starpu_mpi_test(starpu_mpi_req *public_req, int *flag, MPI_Status *status);
-int _starpu_mpi_barrier(MPI_Comm comm);
+void _starpu_mpi_data_flush(starpu_data_handle_t data_handle);
 
 struct _starpu_mpi_argc_argv
 {
@@ -324,14 +320,7 @@ struct _starpu_mpi_argc_argv
 	int world_size;
 };
 
-void _starpu_mpi_progress_shutdown(void **value);
-int _starpu_mpi_progress_init(struct _starpu_mpi_argc_argv *argc_argv);
-#ifdef STARPU_SIMGRID
-void _starpu_mpi_wait_for_initialization();
-#endif
-void _starpu_mpi_data_flush(starpu_data_handle_t data_handle);
-
-/*
+/**
  * Specific functions to backend implementation
  */
 struct _starpu_mpi_backend
@@ -345,6 +334,20 @@ struct _starpu_mpi_backend
 	void (*_starpu_mpi_backend_data_clear)(starpu_data_handle_t data_handle);
 	void (*_starpu_mpi_backend_data_register)(starpu_data_handle_t data_handle, starpu_mpi_tag_t data_tag);
 	void (*_starpu_mpi_backend_comm_register)(MPI_Comm comm);
+
+	int (*_starpu_mpi_backend_progress_init)(struct _starpu_mpi_argc_argv *argc_argv);
+	void (*_starpu_mpi_backend_progress_shutdown)(void **value);
+#ifdef STARPU_SIMGRID
+	void (*_starpu_mpi_backend_wait_for_initialization)();
+#endif
+
+	int (*_starpu_mpi_backend_barrier)(MPI_Comm comm);
+	int (*_starpu_mpi_backend_wait_for_all)(MPI_Comm comm);
+	int (*_starpu_mpi_backend_wait)(starpu_mpi_req *public_req, MPI_Status *status);
+	int (*_starpu_mpi_backend_test)(starpu_mpi_req *public_req, int *flag, MPI_Status *status);
+
+	void (*_starpu_mpi_backend_isend_size_func)(struct _starpu_mpi_req *req);
+	void (*_starpu_mpi_backend_irecv_size_func)(struct _starpu_mpi_req *req);
 };
 
 extern struct _starpu_mpi_backend _mpi_backend;

+ 5 - 0
mpi/tests/abstract_sendrecv_bench.c

@@ -25,6 +25,11 @@ void sendrecv_bench(int mpi_rank, starpu_pthread_barrier_t* thread_barrier)
 
 	if (mpi_rank >= 2)
 	{
+		if (thread_barrier != NULL)
+		{
+			STARPU_PTHREAD_BARRIER_WAIT(thread_barrier);
+		}
+
 		for (uint64_t s = NX_MIN; s <= NX_MAX; s = bench_next_size(s))
 		{
 			iterations = bench_nb_iterations(iterations, s);

+ 1 - 1
mpi/tests/bench_helper.c

@@ -33,7 +33,7 @@ int comp_double(const void*_a, const void*_b)
 
 uint64_t bench_next_size(uint64_t len)
 {
-	uint64_t next = len * MULT_DEFAULT + INCR_DEFAULT;
+	uint64_t next = len * MULT_DEFAULT;
 
 	if(next <= len)
 		next++;

+ 7 - 9
mpi/tests/bench_helper.h

@@ -18,18 +18,16 @@
 #include <starpu_mpi.h>
 #include "helper.h"
 
-#define NX_MAX (512 * 1024 * 1024) // kB
 #define NX_MIN 0
+
 #ifdef STARPU_QUICK_CHECK
-#define MULT_DEFAULT 4
-#else
-#define MULT_DEFAULT 2
-#endif
-#define INCR_DEFAULT 0
-#ifdef STARPU_QUICK_CHECK
-#define LOOPS_DEFAULT 100
+	#define MULT_DEFAULT 4
+	#define LOOPS_DEFAULT 100
+	#define NX_MAX (64 * 1024 * 1024) // kB
 #else
-#define LOOPS_DEFAULT 100000
+	#define MULT_DEFAULT 2
+	#define LOOPS_DEFAULT 100000
+	#define NX_MAX (512 * 1024 * 1024) // kB
 #endif
 
 int comp_double(const void*_a, const void*_b);

+ 0 - 8
mpi/tests/sendrecv_gemm_bench.c

@@ -320,13 +320,6 @@ static void* comm_thread_func(void* arg)
 	return NULL;
 }
 
-#ifdef STARPU_USE_MPI_MPI
-int main(int argc, char **argv)
-{
-	FPRINTF(stderr, "This test does not work with the MPI backend.\n");
-	return STARPU_TEST_SKIPPED;
-}
-#else
 int main(int argc, char **argv)
 {
 	double start, end;
@@ -467,4 +460,3 @@ enodev:
 
 	return ret;
 }
-#endif

+ 2 - 0
mpi/tests/sendrecv_parallel_tasks_bench.c

@@ -112,6 +112,8 @@ void cpu_task(void* descr[], void* args)
 			current_worker, (long long) s, min_lat, bw_million_byte, bw_mbyte, d1_lat, med_lat, avg_lat, d9_lat, max_lat);
 		fflush(stdout);
 	}
+
+	free(lats);
 }
 
 static struct starpu_codelet cl =

+ 2 - 2
src/debug/traces/starpu_fxt.c

@@ -1196,8 +1196,8 @@ static void handle_new_mem_node(struct fxt_ev_64 *ev, struct starpu_fxt_options
  */
 static int create_ordered_stream_id (int nodeid, int devid)
 {
-	static int stable[STARPU_MAXNODES][STARPU_MAXCUDADEVS];
-	STARPU_ASSERT(nodeid < STARPU_MAXNODES);
+	static int stable[MAX_MPI_NODES][STARPU_MAXCUDADEVS];
+	STARPU_ASSERT(nodeid < MAX_MPI_NODES);
 	STARPU_ASSERT(devid < STARPU_MAXCUDADEVS);
 	return stable[nodeid][devid]++;
 }

+ 2 - 0
src/debug/traces/starpu_fxt.h

@@ -41,6 +41,8 @@
 #include <starpu.h>
 #include "../../../include/starpu_fxt.h"
 
+#define MAX_MPI_NODES 64
+
 extern char _starpu_last_codelet_symbol[STARPU_NMAXWORKERS][(FXT_MAX_PARAMS-5)*sizeof(unsigned long)];
 
 void _starpu_fxt_dag_init(char *dag_filename);

+ 0 - 2
src/debug/traces/starpu_fxt_mpi.c

@@ -26,8 +26,6 @@
 #define STARPU_POTI_STR_LEN 200
 #endif
 
-#define MAX_MPI_NODES 64
-
 LIST_TYPE(mpi_transfer,
 	unsigned matched;
 	int src;

+ 1 - 1
tools/gdbinit

@@ -72,7 +72,7 @@ define starpu-print-task
     printf "\tname:\t\t\t\t<%s>\n", $task->name
   end
   printf "\tcodelet:\t\t\t<%p>\n", $task->cl
-  if $task->cl
+  if $task->cl && $task->cl->name
     printf "\tcodelet name:\t\t\t<%s>\n", $task->cl->name
   end
   printf "\tcallback:\t\t\t<%p>\n", $task->callback_func