Browse Source

Merge branch 'master' into fpga

Nathalie Furmento 5 years ago
parent
commit
160bce2876
100 changed files with 1134 additions and 227 deletions
  1. 2 0
      ChangeLog
  2. 84 52
      configure.ac
  3. 4 2
      doc/doxygen/chapters/101_building.doxy
  4. 57 65
      doc/doxygen/chapters/320_scheduling.doxy
  5. 16 0
      doc/doxygen/chapters/410_mpi_support.doxy
  6. 17 0
      doc/doxygen/chapters/501_environment_variables.doxy
  7. 1 1
      examples/mult/xgemm.c
  8. 246 15
      include/starpu_bitmap.h
  9. 3 3
      include/starpu_sched_component.h
  10. 0 0
      julia/Makefile
  11. 0 0
      julia/Manifest.toml
  12. 0 0
      julia/Project.toml
  13. 53 0
      julia/README
  14. 0 0
      julia/REQUIRE
  15. 0 0
      julia/examples/black_scholes/black_scholes.c
  16. 0 2
      julia/black_scholes/black_scholes.jl
  17. 58 0
      julia/examples/mandelbrot/Makefile
  18. 60 0
      julia/examples/mandelbrot/cpu_mandelbrot.c
  19. 166 0
      julia/examples/mandelbrot/mandelbrot.c
  20. 12 22
      julia/mandelbrot/mandelbrot.jl
  21. 96 0
      julia/examples/mandelbrot/mandelbrot_native.jl
  22. 28 16
      julia/mult/Makefile
  23. 0 0
      julia/examples/mult/README
  24. 0 0
      julia/examples/mult/cpu_mult.c
  25. 0 0
      julia/examples/mult/gpu_mult.cu
  26. 0 0
      julia/examples/mult/mult.c
  27. 43 44
      julia/mult/mult.jl
  28. 0 0
      julia/examples/mult/mult.plot
  29. 0 5
      julia/mult/mult_native.jl
  30. 0 0
      julia/examples/mult/res/mult_cstarpu_gcc9_s72_2x2_b4x2.dat
  31. 0 0
      julia/examples/mult/res/mult_gen_gcc9_1x4.dat
  32. 0 0
      julia/examples/mult/res/mult_gen_gcc9_4x1.dat
  33. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s100_4x1.dat
  34. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s50_4x1.dat
  35. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s64_16x16_b4x2.dat
  36. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s64_4x4_b4x2.dat
  37. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s64_8x1_b4x2.dat
  38. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s64_8x8_b4x2.dat
  39. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s72_16x18_b4x2.dat
  40. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s72_16x8_b4x2.dat
  41. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s72_2x2.dat
  42. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s72_2x2_b4x2.dat
  43. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s72_2x2_b4x4.dat
  44. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s72_2x2_b8x2.dat
  45. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s72_4x1.dat
  46. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s72_4x4_b4x2.dat
  47. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s72_8x8_b4x2.dat
  48. 0 0
      julia/examples/mult/res/mult_gen_gcc9_s80_4x1.dat
  49. 0 0
      julia/examples/mult/res/mult_gen_icc_s72_2x1_b4x2.dat
  50. 0 0
      julia/examples/mult/res/mult_gen_icc_s72_4x4_b4x2.dat
  51. 0 0
      julia/examples/mult/res/mult_native.dat
  52. 0 0
      julia/examples/mult/res/mult_nogen_gcc9_s72_2x2_b2x2.dat
  53. 0 0
      julia/examples/mult/res/mult_nogen_gcc9_s72_2x2_b4x2.dat
  54. 0 0
      julia/examples/mult/res/mult_nogen_icc_s72-36_2x2_b4x2.dat
  55. 0 0
      julia/examples/mult/res/mult_nogen_icc_s72_2x2_b4x2.dat
  56. 0 0
      julia/examples/mult/res/mult_nogen_icc_s72x2_2x2_b4x2.dat
  57. 0 0
      julia/examples/old_examples/Makefile.mk
  58. 0 0
      julia/examples/old_examples/README
  59. 0 0
      julia/examples/old_examples/cpu_mult.c
  60. 0 0
      julia/examples/old_examples/gpu_mult.cu
  61. 0 0
      julia/examples/old_examples/includes/display.c
  62. 0 0
      julia/examples/old_examples/includes/display.h
  63. 0 0
      julia/examples/old_examples/includes/sorting.c
  64. 0 0
      julia/examples/old_examples/includes/sorting.h
  65. 0 0
      julia/examples/old_examples/mandelbrot/cpu_cuda_mandelbrot.jl
  66. 0 0
      julia/examples/old_examples/mandelbrot/cpu_mandelbrot.c
  67. 0 0
      julia/examples/old_examples/mandelbrot/cpu_mandelbrot_between.c
  68. 0 0
      julia/examples/old_examples/mandelbrot/gpu_mandelbrot.cu
  69. 0 0
      julia/examples/old_examples/mandelbrot/gpu_mandelbrot_between.cu
  70. 0 0
      julia/examples/old_examples/mandelbrot/mandelbrot.c
  71. 0 0
      julia/examples/old_examples/mandelbrot/mandelbrot.jl
  72. 0 0
      julia/examples/old_examples/mandelbrot/mandelbrot_between.c
  73. 0 0
      julia/examples/old_examples/mandelbrot/mandelbrot_def.jl
  74. 0 0
      julia/examples/old_examples/mandelbrot/mandelbrot_generated.jl
  75. 0 0
      julia/examples/old_examples/mult.c
  76. 0 0
      julia/examples/old_examples/mult/cpu_cuda_mult.jl
  77. 0 0
      julia/examples/old_examples/mult/cpu_mult.c
  78. 0 0
      julia/examples/old_examples/mult/gpu_mult.cu
  79. 0 0
      julia/examples/old_examples/mult/mult.c
  80. 0 0
      julia/examples/old_examples/mult/mult_def.jl
  81. 0 0
      julia/examples/old_examples/mult/mult_extern.jl
  82. 0 0
      julia/examples/old_examples/mult/mult_extern_graph.jl
  83. 0 0
      julia/examples/old_examples/mult/mult_generated.jl
  84. 0 0
      julia/examples/old_examples/mult/mult_generated_graph.jl
  85. 0 0
      julia/examples/old_examples/mult/mult_naive.jl
  86. 0 0
      julia/examples/old_examples/nbody/cpu_cuda_nbody.jl
  87. 0 0
      julia/examples/old_examples/nbody/cpu_nbody.c
  88. 0 0
      julia/examples/old_examples/nbody/cpu_nbody_between.c
  89. 0 0
      julia/examples/old_examples/nbody/gpu_nbody.cu
  90. 0 0
      julia/examples/old_examples/nbody/gpu_nbody_between.cu
  91. 0 0
      julia/examples/old_examples/nbody/nbody.c
  92. 0 0
      julia/examples/old_examples/nbody/nbody.jl
  93. 0 0
      julia/examples/old_examples/nbody/nbody_between.c
  94. 0 0
      julia/examples/old_examples/nbody/nbody_def.jl
  95. 0 0
      julia/examples/old_examples/nbody/nbody_display.jl
  96. 0 0
      julia/examples/old_examples/nbody/nbody_generated.jl
  97. 51 0
      julia/examples/task_insert_color/Makefile
  98. 89 0
      julia/examples/task_insert_color/task_insert_color.c
  99. 48 0
      julia/examples/task_insert_color/task_insert_color.jl
  100. 0 0
      julia/examples/variable/Makefile

+ 2 - 0
ChangeLog

@@ -18,6 +18,7 @@ StarPU 1.4.0 (git revision xxxx)
 ==============================================
 New features:
   * Fault tolerance support with starpu_task_ft_failed().
+  * Julia programming interface.
   * Add get_max_size method to data interfaces for applications using data with
     variable size to express their maximal potential size.
   * New offline tool to draw graph showing elapsed time between sent
@@ -52,6 +53,7 @@ Small features:
   * Add STARPU_LIMIT_CPU_NUMA_MEM environment variable.
   * Add STARPU_WORKERS_GETBIND environment variable.
   * Add STARPU_SCHED_SIMPLE_DECIDE_ALWAYS modular scheduler flag.
+  * Add STARPU_LIMIT_BANDWIDTH environment variable.
 
 StarPU 1.3.3 (git revision 11afc5b007fe1ab1c729b55b47a5a98ef7f3cfad)
 ====================================================================

+ 84 - 52
configure.ac

@@ -88,11 +88,21 @@ AC_CHECK_PROGS(PROG_DATE,gdate date)
 dnl locate pkg-config
 PKG_PROG_PKG_CONFIG
 
+AC_ARG_ENABLE(simgrid, [AS_HELP_STRING([--enable-simgrid],
+			[Enable simulating execution in simgrid])],
+			enable_simgrid=$enableval, enable_simgrid=no)
+
 if test x$enable_perf_debug = xyes; then
     enable_shared=no
 fi
+
 default_enable_mpi_check=maybe
-default_enable_mpi=maybe
+
+if test x$enable_simgrid = xyes ; then
+	default_enable_mpi=no
+else
+	default_enable_mpi=maybe
+fi
 
 ###############################################################################
 #                                                                             #
@@ -235,9 +245,6 @@ AC_ARG_WITH(simgrid-lib-dir,
 		enable_simgrid=yes
 	], [simgrid_lib_dir=no])
 
-AC_ARG_ENABLE(simgrid, [AS_HELP_STRING([--enable-simgrid],
-			[Enable simulating execution in simgrid])],
-			enable_simgrid=$enableval, enable_simgrid=no)
 if test x$enable_simgrid = xyes ; then
    	if test -n "$SIMGRID_CFLAGS" ; then
 	   	CFLAGS="$SIMGRID_CFLAGS $CFLAGS"
@@ -290,7 +297,7 @@ if test x$enable_simgrid = xyes ; then
 
 	# Latest functions
 	AC_CHECK_FUNCS([MSG_process_attach sg_actor_attach sg_actor_init sg_actor_set_stacksize MSG_zone_get_hosts sg_zone_get_hosts MSG_process_self_name MSG_process_userdata_init sg_actor_data])
-	AC_CHECK_FUNCS([xbt_mutex_try_acquire smpi_process_set_user_data SMPI_thread_create sg_zone_get_by_name sg_link_name sg_host_route sg_host_self sg_host_list sg_host_speed simcall_process_create sg_config_continue_after_help])
+	AC_CHECK_FUNCS([xbt_mutex_try_acquire smpi_process_set_user_data SMPI_thread_create sg_zone_get_by_name sg_link_name sg_link_bandwidth_set sg_host_route sg_host_self sg_host_list sg_host_speed simcall_process_create sg_config_continue_after_help])
 	AC_CHECK_FUNCS([simgrid_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_SIMGRID_INIT], [1], [Define to 1 if you have the `simgrid_init' function.])])
 	AC_CHECK_FUNCS([xbt_barrier_init], [AC_DEFINE([STARPU_SIMGRID_HAVE_XBT_BARRIER_INIT], [1], [Define to 1 if you have the `xbt_barrier_init' function.])])
 	AC_CHECK_FUNCS([sg_actor_sleep_for sg_actor_self sg_actor_ref sg_host_get_properties sg_host_send_to sg_host_sendto sg_cfg_set_int sg_actor_self_execute sg_actor_execute simgrid_get_clock])
@@ -423,25 +430,43 @@ if test x$enable_simgrid = xyes ; then
 else
     DEFAULT_MPICC=mpicc
 fi
-AC_ARG_WITH(mpicc, [AS_HELP_STRING([--with-mpicc=<path to mpicc>], [Path of the mpicc compiler])], [DEFAULT_MPICC=$withval])
-AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH])
+AC_ARG_WITH(mpicc, [AS_HELP_STRING([--with-mpicc=<mpicc name or path to mpicc>], [Name or path of the mpicc compiler])], [DEFAULT_MPICC=$withval])
+case $DEFAULT_MPICC in
+	/*) mpicc_path="$DEFAULT_MPICC" ;;
+	*)  AC_PATH_PROG(mpicc_path, $DEFAULT_MPICC, [no], [$simgrid_dir/bin:$PATH]) ;;
+esac
+# We test if the MPICC compiler exists
+if test ! -x $mpicc_path; then
+    AC_MSG_RESULT(The mpicc compiler '$mpicc_path' does not have the execute permission)
+    mpicc_path=no
+fi
+
 AC_MSG_CHECKING(whether mpicc is available)
 AC_MSG_RESULT($mpicc_path)
 AC_SUBST(MPICC, $mpicc_path)
 
+if test x$mpicc_path != xno ; then
+    MPIPATH=$(dirname $mpicc_path):$PATH
+else
+    MPIPATH=$PATH
+fi
+
 #Check MPICXX/MPIC++
 if test x$enable_simgrid = xyes ; then
     DEFAULT_MPICXX=smpicxx
 else
     DEFAULT_MPICXX=mpicxx
 fi
-AC_ARG_WITH(mpicxx, [AS_HELP_STRING([--with-mpicxx=<path to mpicxx>], [Path of the mpicxx/mpic++ compiler])], [DEFAULT_MPICXX=$withval])
-AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
+AC_ARG_WITH(mpicxx, [AS_HELP_STRING([--with-mpicxx=<mpicxx name or path to mpicxx>], [Name or path of the mpicxx/mpic++ compiler])], [DEFAULT_MPICXX=$withval])
+case $DEFAULT_MPICXX in
+	/*) mpicxx_path="$DEFAULT_MPICXX" ;;
+	*)  AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$MPIPATH]) ;;
+esac
 
 # try with mpic++ if mpicxx was not found
 if test x$mpicxx_path = xno ; then
     DEFAULT_MPICXX=mpic++
-    AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$simgrid_dir/bin:$PATH])
+    AC_PATH_PROG(mpicxx_path, $DEFAULT_MPICXX, [no], [$MPIPATH])
 fi
 
 # We test if the MPICXX/MPIC++ compiler exists
@@ -454,6 +479,30 @@ AC_MSG_CHECKING(whether mpicxx is available)
 AC_MSG_RESULT($mpicxx_path)
 AC_SUBST(MPICXX, $mpicxx_path)
 
+# Check if mpiexec is available
+if test x$enable_simgrid = xyes ; then
+    DEFAULT_MPIEXEC=smpirun
+    AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<name of smpirun or path to smpirun>]], [Name or path of the smpirun helper])], [DEFAULT_MPIEXEC=$withval])
+else
+    DEFAULT_MPIEXEC=mpiexec
+    AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec=<name of mpiexec or path to mpiexec>], [Name or path of mpiexec])], [DEFAULT_MPIEXEC=$withval])
+fi
+
+case $DEFAULT_MPIEXEC in
+    /*) mpiexec_path="$DEFAULT_MPIEXEC" ;;
+    *)  AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$MPIPATH])
+esac
+AC_MSG_CHECKING(whether mpiexec is available)
+AC_MSG_RESULT($mpiexec_path)
+
+# We test if MPIEXEC exists
+if test ! -x $mpiexec_path; then
+    AC_MSG_RESULT(The mpiexec script '$mpiexec_path' is not valid)
+    default_enable_mpi_check=no
+    mpiexec_path=""
+fi
+AC_SUBST(MPIEXEC,$mpiexec_path)
+
 ###############################################################################
 #                                                                             #
 #                                    MPI                                      #
@@ -586,33 +635,6 @@ if test x$enable_mpi = xno ; then
     running_mpi_check=no
 fi
 
-if test x$enable_mpi = xyes -a x$running_mpi_check = xyes ; then
-    # Check if mpiexec is available
-    if test x$enable_simgrid = xyes ; then
-	DEFAULT_MPIEXEC=smpirun
-        AC_ARG_WITH(smpirun, [AS_HELP_STRING([--with-smpirun[=<path to smpirun>]], [Path of the smpirun helper])], [DEFAULT_MPIEXEC=$withval])
-	AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$simgrid_dir/bin:$PATH])
-    else
-	DEFAULT_MPIEXEC=mpiexec
-	AC_ARG_WITH(mpiexec, [AS_HELP_STRING([--with-mpiexec=<path to mpiexec>], [Path of mpiexec])], [DEFAULT_MPIEXEC=$withval])
-	if test x$mpicc_path = x ; then
-	    AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$PATH])
-	else
-	    AC_PATH_PROG(mpiexec_path, $DEFAULT_MPIEXEC, [no], [$(dirname $mpicc_path):$PATH])
-	fi
-    fi
-    AC_MSG_CHECKING(whether mpiexec is available)
-    AC_MSG_RESULT($mpiexec_path)
-
-    # We test if MPIEXEC exists
-    if test ! -x $mpiexec_path; then
-        AC_MSG_RESULT(The mpiexec script '$mpiexec_path' is not valid)
-        running_mpi_check=no
-        mpiexec_path=""
-    fi
-    AC_SUBST(MPIEXEC,$mpiexec_path)
-fi
-
 AM_CONDITIONAL(STARPU_MPI_CHECK, test x$running_mpi_check = xyes)
 AC_MSG_CHECKING(whether MPI tests should be run)
 AC_MSG_RESULT($running_mpi_check)
@@ -2293,9 +2315,9 @@ AC_MSG_RESULT($nmaxbuffers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXBUFS, [$nmaxbuffers],
 		[how many buffers can be manipulated per task])
 
-AC_MSG_CHECKING(maximum number of nodes to use)
+AC_MSG_CHECKING(maximum number of memory nodes to use per MPI rank)
 AC_ARG_ENABLE(maxnodes, [AS_HELP_STRING([--enable-maxnodes=<nnodes>],
-			[maximum number of nodes])],
+			[maximum number of memory nodes per MPI rank])],
 			maxnodes=$enableval, maxnodes=0)
 
 if test x$maxnodes = x0 ; then
@@ -2576,16 +2598,19 @@ enable_build_fortran=no
 if test "x$enable_build_fortran_requested" = "xyes" ; then
    if test "x$FC" != "x"; then
    	if $FC --version|grep -q 'GNU Fortran'; then
-		 AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
-     	         #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 9)
-                 int dummy;
-                 #else
-                 #error GFortran too old, version >= 4.9.x needed, Fortran examples will not be built
-                 #endif
-                 ]],
+		 AC_LANG_PUSH([Fortran])
+		 OLD_FCFLAGS="$FCFLAGS"
+		 FCFLAGS="$FCFLAGS -cpp"
+		 AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [[
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 9)
+#error GFortran too old, version >= 4.9.x needed, Fortran examples will not be built
+#endif
+]]
                  )],
                  [enable_build_fortran="yes"],
                  [enable_build_fortran="no"])
+		 FCFLAGS="$OLD_FCFLAGS"
+		 AC_LANG_POP([Fortran])
                  if test "$enable_build_fortran" = "no" ; then
                    AC_MSG_WARN([GFortran too old, version >= 4.9.x needed, Fortran examples will not be built])
                  fi
@@ -2628,8 +2653,10 @@ if test "x$enable_build_fortran_requested" = "xyes" ; then
 					      else
 						      DEFAULT_MPIFORT=mpif90
 					      fi
-					      # nothing was specified: default value is used
-					      AC_PATH_PROG(mpifort_path, $DEFAULT_MPIFORT, [no], [$(dirname $mpicc_path):$simgrid_dir/bin:$PATH])
+					      case $DEFAULT_MPIFORT in
+					      	/*) mpifort_path="$DEFAULT_MPIFORT" ;;
+					        *)  AC_PATH_PROG(mpifort_path, $DEFAULT_MPIFORT, [no], [$MPIPATH])
+					      esac
 					      ])
 
 			# We test if the MPIFORT compiler exists
@@ -3183,10 +3210,15 @@ AC_ARG_WITH([hwloc],
 				if test ! -d "$withval" ; then
 				   AC_MSG_ERROR("Directory specified for hwloc <$withval> does not exist")
 				fi
-				if test ! -d "$withval/lib/pkgconfig" ; then
-				   AC_MSG_ERROR("Hwloc directory <$withval> does not have a subdirectory lib/pkgconfig")
+				if test -d "$withval/lib64/pkgconfig" ; then
+				   export PKG_CONFIG_PATH=$withval/lib64/pkgconfig:$PKG_CONFIG_PATH
+			        else
+				   if test -d "$withval/lib/pkgconfig" ; then
+				      export PKG_CONFIG_PATH=$withval/lib/pkgconfig:$PKG_CONFIG_PATH
+				   else
+				      AC_MSG_ERROR("Hwloc directory <$withval> does not have a subdirectory lib/pkgconfig or lib64/pkgconfig")
+				   fi
 				fi
-				export PKG_CONFIG_PATH=$withval/lib/pkgconfig:$PKG_CONFIG_PATH
 				use_hwloc=yes
 			fi
 		else
@@ -3594,7 +3626,6 @@ AC_OUTPUT([
 	Makefile
 	src/Makefile
 	tools/Makefile
-	tools/replay-mpi/Makefile
 	tools/starpu_env
 	tools/starpu_codelet_profile
 	tools/starpu_codelet_histo_profile
@@ -3645,6 +3676,7 @@ AC_OUTPUT([
 	mpi/src/Makefile
 	mpi/tests/Makefile
 	mpi/examples/Makefile
+	mpi/tools/Makefile
 	sc_hypervisor/Makefile
 	sc_hypervisor/src/Makefile
 	sc_hypervisor/examples/Makefile

+ 4 - 2
doc/doxygen/chapters/101_building.doxy

@@ -43,8 +43,10 @@ can be used to install StarPU.
 The <c>hwloc</c> (http://www.open-mpi.org/software/hwloc) topology
 discovery library is not mandatory to use StarPU but strongly
 recommended.  It allows for topology aware scheduling, which improves
-performance.  <c>libhwloc</c> is available in major free operating system
-distributions, and for most operating systems.
+performance. <c>hwloc</c> is available in major free operating system
+distributions, and for most operating systems. Make sure to not only install
+a <c>hwloc</c> or <c>libhwloc</c> package, but also <c>hwloc-devel</c> or
+<c>libhwloc-dev</c> so as to have hwloc headers etc.
 
 If <c>libhwloc</c> is installed in a standard
 location, no option is required, it will be detected automatically,

+ 57 - 65
doc/doxygen/chapters/320_scheduling.doxy

@@ -39,33 +39,33 @@ STARPU_SCHED. For instance <c>export STARPU_SCHED=dmda</c> . Use <c>help</c> to
 get the list of available schedulers.
 
 
-<b>Non Performance Modelling Policies:</b>
+\subsection NonPerformanceModelingPolicies Non Performance Modelling Policies
 
-The <b>eager</b> scheduler uses a central task queue, from which all workers draw tasks
+- The <b>eager</b> scheduler uses a central task queue, from which all workers draw tasks
 to work on concurrently. This however does not permit to prefetch data since the scheduling
 decision is taken late. If a task has a non-0 priority, it is put at the front of the queue.
 
-The <b>random</b> scheduler uses a queue per worker, and distributes tasks randomly according to assumed worker
+- The <b>random</b> scheduler uses a queue per worker, and distributes tasks randomly according to assumed worker
 overall performance.
 
-The <b>ws</b> (work stealing) scheduler uses a queue per worker, and schedules
+- The <b>ws</b> (work stealing) scheduler uses a queue per worker, and schedules
 a task on the worker which released it by
 default. When a worker becomes idle, it steals a task from the most loaded
 worker.
 
-The <b>lws</b> (locality work stealing) scheduler uses a queue per worker, and schedules
+- The <b>lws</b> (locality work stealing) scheduler uses a queue per worker, and schedules
 a task on the worker which released it by
 default. When a worker becomes idle, it steals a task from neighbour workers. It
 also takes into account priorities.
 
-The <b>prio</b> scheduler also uses a central task queue, but sorts tasks by
+- The <b>prio</b> scheduler also uses a central task queue, but sorts tasks by
 priority specified by the programmer (between -5 and 5).
 
-The <b>heteroprio</b> scheduler uses different priorities for the different processing units.
+- The <b>heteroprio</b> scheduler uses different priorities for the different processing units.
This scheduler must be configured to work correctly and to expect high-performance
 as described in the corresponding section.
 
-\section DMTaskSchedulingPolicy Performance Model-Based Task Scheduling Policies
+\subsection DMTaskSchedulingPolicy Performance Model-Based Task Scheduling Policies
 
 If (<b>and only if</b>) your application <b>codelets have performance models</b> (\ref
 PerformanceModelExample), you should change the scheduler thanks to the
@@ -87,47 +87,84 @@ family policy using performance model hints. A low or zero percentage may be
 the sign that performance models are not converging or that codelets do not
 have performance models enabled.
 
-<b>Performance Modelling Policies:</b>
-
-The <b>dm</b> (deque model) scheduler takes task execution performance models into account to
+- The <b>dm</b> (deque model) scheduler takes task execution performance models into account to
 perform a HEFT-similar scheduling strategy: it schedules tasks where their
 termination time will be minimal. The difference with HEFT is that <b>dm</b>
 schedules tasks as soon as they become available, and thus in the order they
 become available, without taking priorities into account.
 
-The <b>dmda</b> (deque model data aware) scheduler is similar to dm, but it also takes
+- The <b>dmda</b> (deque model data aware) scheduler is similar to dm, but it also takes
 into account data transfer time.
 
-The <b>dmdap</b> (deque model data aware prio) scheduler is similar to dmda,
+- The <b>dmdap</b> (deque model data aware prio) scheduler is similar to dmda,
 except that it sorts tasks by priority order, which allows to become even closer
 to HEFT by respecting priorities after having made the scheduling decision (but
 it still schedules tasks in the order they become available).
 
-The <b>dmdar</b> (deque model data aware ready) scheduler is similar to dmda,
+- The <b>dmdar</b> (deque model data aware ready) scheduler is similar to dmda,
 but it also privileges tasks whose data buffers are already available
 on the target device.
 
-The <b>dmdas</b> combines dmdap and dmdas: it sorts tasks by priority order,
+- The <b>dmdas</b> combines dmdap and dmdar: it sorts tasks by priority order,
 but for a given priority it will privilege tasks whose data buffers are already
 available on the target device.
 
-The <b>dmdasd</b> (deque model data aware sorted decision) scheduler is similar
+- The <b>dmdasd</b> (deque model data aware sorted decision) scheduler is similar
 to dmdas, except that when scheduling a task, it takes into account its priority
 when computing the minimum completion time, since this task may get executed
 before others, and thus the latter should be ignored.
 
-The <b>heft</b> (heterogeneous earliest finish time) scheduler is a deprecated
+- The <b>heft</b> (heterogeneous earliest finish time) scheduler is a deprecated
 alias for <b>dmda</b>.
 
-The <b>pheft</b> (parallel HEFT) scheduler is similar to dmda, it also supports
+- The <b>pheft</b> (parallel HEFT) scheduler is similar to dmda, it also supports
 parallel tasks (still experimental). Should not be used when several contexts using
 it are being executed simultaneously.
 
-The <b>peager</b> (parallel eager) scheduler is similar to eager, it also
+- The <b>peager</b> (parallel eager) scheduler is similar to eager, it also
 supports parallel tasks (still experimental). Should not be used when several 
 contexts using it are being executed simultaneously.
 
-TODO: describe modular schedulers
+\subsection ExistingModularizedSchedulers Modularized Schedulers
+
+StarPU provides a powerful way to implement schedulers, as documented in \ref
+DefiningANewModularSchedulingPolicy . It is currently shipped with the following
+pre-defined Modularized Schedulers :
+
+
+- <b>modular-eager</b>, <b>modular-eager-prefetching</b> are eager-based Schedulers (without and with prefetching), they are \n
+naive schedulers, which try to map a task on the first available resource
+they find. The prefetching variant queues several tasks in advance to be able to
+do data prefetching. This may however degrade load balancing a bit.
+
+- <b>modular-prio</b>, <b>modular-prio-prefetching</b>, <b>modular-eager-prio</b> are prio-based Schedulers (without / with prefetching):
+similar to Eager-Based Schedulers. Can handle tasks which have a defined
+priority and schedule them accordingly.
+The <b>modular-eager-prio</b> variant integrates the eager and priority queue in a
+single component. This allows it to do a better job at pushing tasks.
+
+- <b>modular-random</b>, <b>modular-random-prio</b>, <b>modular-random-prefetching</b>, <b>modular-random-prio-prefetching</b> are random-based Schedulers (without/with prefetching) : \n
+Select randomly a resource to be mapped on for each task.
+
+- <b>modular-ws</b> implements Work Stealing:
+Maps tasks to workers in round robin, but allows workers to steal work from other workers.
+
+- <b>modular-heft</b>, <b>modular-heft2</b>, and <b>modular-heft-prio</b> are
+HEFT Schedulers : \n
+Maps tasks to workers using a heuristic very close to
+Heterogeneous Earliest Finish Time.
+It needs that every task submitted to StarPU have a
+defined performance model (\ref PerformanceModelCalibration)
+to work efficiently, but can handle tasks without a performance
+model. <b>modular-heft</b> just takes tasks by priority order. <b>modular-heft2</b> takes
+at most 5 tasks of the same priority and checks which one fits best.
+<b>modular-heft-prio</b> is similar to <b>modular-heft</b>, but only decides the memory
+node, not the exact worker, just pushing tasks to one central queue per memory
+node.
+
+- <b>modular-heteroprio</b> is a Heteroprio Scheduler: \n
+Maps tasks to worker similarly to HEFT, but first attribute accelerated tasks to
+GPUs, then not-so-accelerated tasks to CPUs.
 
 \section TaskDistributionVsDataTransfer Task Distribution Vs Data Transfer
 
@@ -198,51 +235,6 @@ use starpu_task_expected_length() on the task (in µs), multiplied by the
 typical power consumption of the device, e.g. in W, and divided by 1000000. to
 get Joules.
 
-\section ExistingModularizedSchedulers Modularized Schedulers
-
-StarPU provides a powerful way to implement schedulers, as documented in \ref
-DefiningANewModularSchedulingPolicy . It is currently shipped with the following
-pre-defined Modularized Schedulers :
-
-- Eager-based Schedulers (with/without prefetching : \c modular-eager ,
-\c modular-eager-prefetching) : \n
-Naive scheduler, which tries to map a task on the first available resource
-it finds. The prefecthing variant queues several tasks in advance to be able to
-do data prefetching. This may however degrade load balancing a bit.
-
-- Prio-based Schedulers (with/without prefetching :
-\c modular-prio, \c modular-prio-prefetching , \c modular-eager-prio) : \n
-Similar to Eager-Based Schedulers. Can handle tasks which have a defined
-priority and schedule them accordingly.
-The \c modular-eager-prio variant integrates the eager and priority queue in a
-single component. This allows it to do a better job at pushing tasks.
-
-- Random-based Schedulers (with/without prefetching: \c modular-random,
-\c modular-random-prio, \c modular-random-prefetching, \c
-modular-random-prio-prefetching) : \n
-Selects randomly a resource to be mapped on for each task.
-
-- Work Stealing (\c modular-ws) : \n
-Maps tasks to workers in round robin, but allows workers to steal work from other workers.
-
-- HEFT Scheduler : \n
-Maps tasks to workers using a heuristic very close to
-Heterogeneous Earliest Finish Time.
-It needs that every task submitted to StarPU have a
-defined performance model (\ref PerformanceModelCalibration)
-to work efficiently, but can handle tasks without a performance
-model. \c modular-heft just takes tasks by priority order. \c modular-heft takes
-at most 5 tasks of the same priority and checks which one fits best. \c
-modular-heft-prio is similar to \c modular-heft, but only decides the memory
-node, not the exact worker, just pushing tasks to one central queue per memory
-node.
-
-- Heteroprio Scheduler: \n
-Maps tasks to worker similarly to HEFT, but first attribute accelerated tasks to
-GPUs, then not-so-accelerated tasks to CPUs.
-
-To use one of these schedulers, one can set the environment variable \ref STARPU_SCHED.
-
 \section StaticScheduling Static Scheduling
 
 In some cases, one may want to force some scheduling, for instance force a given

+ 16 - 0
doc/doxygen/chapters/410_mpi_support.doxy

@@ -34,6 +34,22 @@ An MPI Insert Task function provides an even more seamless transition to a
 distributed application, by automatically issuing all required data transfers
 according to the task graph and an application-provided distribution.
 
+\section MPIBuild Building with MPI support
+
+If a <c>mpicc</c> compiler is already in your PATH, StarPU will automatically
+enable MPI support in the build. If <c>mpicc</c> is not in PATH, you
+can specify its location by passing <c>--with-mpicc=/where/there/is/mpicc</c> to
+<c>./configure</c>
+
+It can be useful to enable MPI tests during <c>make check</c> by passing
+<c>--enable-mpi-check</c> to <c>./configure</c>. And similarly to
+<c>mpicc</c>, if <c>mpiexec</c> is not in PATH, you can specify its location by passing
+<c>--with-mpiexec=/where/there/is/mpiexec</c> to <c>./configure</c>, but this is
+not needed if it is next to <c>mpicc</c>, configure will look there in addition to PATH.
+
+Similarly, Fortran examples use <c>mpif90</c>, which can be specified manually
+with <c>--with-mpifort</c> if it can't be found automatically.
+
 \section ExampleDocumentation Example Used In This Documentation
 
 The example below will be used as the base for this documentation. It

+ 17 - 0
doc/doxygen/chapters/501_environment_variables.doxy

@@ -986,6 +986,23 @@ NUMA nodes used by StarPU. Any \ref STARPU_LIMIT_CPU_NUMA_devid_MEM additionally
 specified will take over STARPU_LIMIT_CPU_NUMA_MEM.
 </dd>
 
+<dt>STARPU_LIMIT_BANDWIDTH</dt>
+<dd>
+\anchor STARPU_LIMIT_BANDWIDTH
+\addindex __env__STARPU_LIMIT_BANDWIDTH
+Specify the maximum available PCI bandwidth of the system in MB/s. This can only
+be effective with simgrid simulation. This allows to easily override the
+bandwidths stored in the platform file generated from measurements on the native
+system. This can be used e.g. to conveniently simulate the behavior of the
+application under various PCI bandwidth limitations.
+</dd>
+
 <dt>STARPU_MINIMUM_AVAILABLE_MEM</dt>
 <dd>
 \anchor STARPU_MINIMUM_AVAILABLE_MEM

+ 1 - 1
examples/mult/xgemm.c

@@ -334,7 +334,7 @@ static void parse_args(int argc, char **argv)
 		}
 		else
 		{
-			fprintf(stderr,"Unrecognized option %s", argv[i]);
+			fprintf(stderr,"Unrecognized option %s\n", argv[i]);
 			exit(EXIT_FAILURE);
 		}
 	}

+ 246 - 15
include/starpu_bitmap.h

@@ -18,6 +18,12 @@
 #ifndef __STARPU_BITMAP_H__
 #define __STARPU_BITMAP_H__
 
+#include <starpu_util.h>
+#include <starpu_config.h>
+
+#include <string.h>
+#include <stdlib.h>
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -28,43 +34,268 @@ extern "C"
    @brief This is the interface for the bitmap utilities provided by StarPU.
    @{
  */
+#ifndef _STARPU_LONG_BIT
+#define _STARPU_LONG_BIT ((int)(sizeof(unsigned long) * 8))
+#endif
+
+#define _STARPU_BITMAP_SIZE ((STARPU_NMAXWORKERS - 1)/_STARPU_LONG_BIT) + 1
 
 /** create a empty starpu_bitmap */
-struct starpu_bitmap *starpu_bitmap_create(void) STARPU_ATTRIBUTE_MALLOC;
+static inline struct starpu_bitmap *starpu_bitmap_create(void) STARPU_ATTRIBUTE_MALLOC;
+/** zero a starpu_bitmap */
+static inline void starpu_bitmap_init(struct starpu_bitmap *b);
 /** free \p b */
-void starpu_bitmap_destroy(struct starpu_bitmap *b);
+static inline void starpu_bitmap_destroy(struct starpu_bitmap *b);
 
 /** set bit \p e in \p b */
-void starpu_bitmap_set(struct starpu_bitmap *b, int e);
+static inline void starpu_bitmap_set(struct starpu_bitmap *b, int e);
 /** unset bit \p e in \p b */
-void starpu_bitmap_unset(struct starpu_bitmap *b, int e);
+static inline void starpu_bitmap_unset(struct starpu_bitmap *b, int e);
 /** unset all bits in \p b */
-void starpu_bitmap_unset_all(struct starpu_bitmap *b);
+static inline void starpu_bitmap_unset_all(struct starpu_bitmap *b);
 
 /** return true iff bit \p e is set in \p b */
-int starpu_bitmap_get(struct starpu_bitmap *b, int e);
+static inline int starpu_bitmap_get(struct starpu_bitmap *b, int e);
 /** Basically compute \c starpu_bitmap_unset_all(\p a) ; \p a = \p b & \p c; */
-void starpu_bitmap_unset_and(struct starpu_bitmap *a, struct starpu_bitmap *b, struct starpu_bitmap *c);
+static inline void starpu_bitmap_unset_and(struct starpu_bitmap *a, struct starpu_bitmap *b, struct starpu_bitmap *c);
 /** Basically compute \p a |= \p b */
-void starpu_bitmap_or(struct starpu_bitmap *a, struct starpu_bitmap *b);
+static inline void starpu_bitmap_or(struct starpu_bitmap *a, struct starpu_bitmap *b);
 /** return 1 iff \p e is set in \p b1 AND \p e is set in \p b2 */
-int starpu_bitmap_and_get(struct starpu_bitmap *b1, struct starpu_bitmap *b2, int e);
+static inline int starpu_bitmap_and_get(struct starpu_bitmap *b1, struct starpu_bitmap *b2, int e);
 /** return the number of set bits in \p b */
-int starpu_bitmap_cardinal(struct starpu_bitmap *b);
+static inline int starpu_bitmap_cardinal(struct starpu_bitmap *b);
 
 /** return the index of the first set bit of \p b, -1 if none */
-int starpu_bitmap_first(struct starpu_bitmap *b);
+static inline int starpu_bitmap_first(struct starpu_bitmap *b);
 /** return the position of the last set bit of \p b, -1 if none */
-int starpu_bitmap_last(struct starpu_bitmap *b);
+static inline int starpu_bitmap_last(struct starpu_bitmap *b);
 /** return the position of set bit right after \p e in \p b, -1 if none */
-int starpu_bitmap_next(struct starpu_bitmap *b, int e);
+static inline int starpu_bitmap_next(struct starpu_bitmap *b, int e);
 /** todo */
-int starpu_bitmap_has_next(struct starpu_bitmap *b, int e);
+static inline int starpu_bitmap_has_next(struct starpu_bitmap *b, int e);
 
 /** @} */
 
-#ifdef __cplusplus
+struct starpu_bitmap
+{
+	unsigned long bits[_STARPU_BITMAP_SIZE];
+	int cardinal;
+};
+
+#ifdef _STARPU_DEBUG_BITMAP
+static int _starpu_check_bitmap(struct starpu_bitmap *b)
+{
+	int card = b->cardinal;
+	int i = starpu_bitmap_first(b);
+	int j;
+	for(j = 0; j < card; j++)
+	{
+		if(i == -1)
+			return 0;
+		int tmp = starpu_bitmap_next(b,i);
+		if(tmp == i)
+			return 0;
+		i = tmp;
+	}
+	if(i != -1)
+		return 0;
+	return 1;
 }
+#else
+#define _starpu_check_bitmap(b) 1
 #endif
 
+static int _starpu_count_bit_static(unsigned long e)
+{
+#if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__) >= 4)
+	return __builtin_popcountl(e);
+#else
+	int c = 0;
+	while(e)
+	{
+		c += e&1;
+		e >>= 1;
+	}
+	return c;
 #endif
+}
+
+static inline struct starpu_bitmap *starpu_bitmap_create()
+{
+	return (struct starpu_bitmap *) calloc(1, sizeof(struct starpu_bitmap));
+}
+
+static inline void starpu_bitmap_init(struct starpu_bitmap *b)
+{
+	memset(b, 0, sizeof(*b));
+}
+
+static inline void starpu_bitmap_destroy(struct starpu_bitmap * b)
+{
+	free(b);
+}
+
+static inline void starpu_bitmap_set(struct starpu_bitmap * b, int e)
+{
+	if(!starpu_bitmap_get(b, e))
+		b->cardinal++;
+	else
+		return;
+	STARPU_ASSERT(e/_STARPU_LONG_BIT < _STARPU_BITMAP_SIZE);
+	b->bits[e/_STARPU_LONG_BIT] |= (1ul << (e%_STARPU_LONG_BIT));
+	STARPU_ASSERT(_starpu_check_bitmap(b));
+}
+static inline void starpu_bitmap_unset(struct starpu_bitmap *b, int e)
+{
+	if(starpu_bitmap_get(b, e))
+		b->cardinal--;
+	else
+		/* bit already clear (or e out of range): nothing to do */
+		return;
+	STARPU_ASSERT(e/_STARPU_LONG_BIT < _STARPU_BITMAP_SIZE);
+	/* defensive bounds check: the last valid word index is
+	   _STARPU_BITMAP_SIZE - 1, so the comparison must be >=, not >
+	   (was an off-by-one) */
+	if(e / _STARPU_LONG_BIT >= _STARPU_BITMAP_SIZE)
+		return;
+	b->bits[e/_STARPU_LONG_BIT] &= ~(1ul << (e%_STARPU_LONG_BIT));
+	STARPU_ASSERT(_starpu_check_bitmap(b));
+}
+
+static inline void starpu_bitmap_unset_all(struct starpu_bitmap * b)
+{
+	memset(b->bits, 0, _STARPU_BITMAP_SIZE * sizeof(unsigned long));
+}
+
+static inline void starpu_bitmap_unset_and(struct starpu_bitmap * a, struct starpu_bitmap * b, struct starpu_bitmap * c)
+{
+	a->cardinal = 0;
+	int i;
+	for(i = 0; i < _STARPU_BITMAP_SIZE; i++)
+	{
+		a->bits[i] = b->bits[i] & c->bits[i];
+		a->cardinal += _starpu_count_bit_static(a->bits[i]);
+	}
+}
+
+static inline int starpu_bitmap_get(struct starpu_bitmap * b, int e)
+{
+	STARPU_ASSERT(e / _STARPU_LONG_BIT < _STARPU_BITMAP_SIZE);
+	if(e / _STARPU_LONG_BIT >= _STARPU_BITMAP_SIZE)
+		return 0;
+	return (b->bits[e/_STARPU_LONG_BIT] & (1ul << (e%_STARPU_LONG_BIT))) ?
+		1:
+		0;
+}
+
+static inline void starpu_bitmap_or(struct starpu_bitmap * a, struct starpu_bitmap * b)
+{
+	int i;
+	a->cardinal = 0;
+	for(i = 0; i < _STARPU_BITMAP_SIZE; i++)
+	{
+		a->bits[i] |= b->bits[i];
+		a->cardinal += _starpu_count_bit_static(a->bits[i]);
+	}
+}
+
+
+static inline int starpu_bitmap_and_get(struct starpu_bitmap * b1, struct starpu_bitmap * b2, int e)
+{
+	return starpu_bitmap_get(b1,e) && starpu_bitmap_get(b2,e);
+}
+
+static inline int starpu_bitmap_cardinal(struct starpu_bitmap * b)
+{
+	return b->cardinal;
+}
+
+
+static inline int _starpu_get_first_bit_rank(unsigned long ms)
+{
+	STARPU_ASSERT(ms != 0);
+#if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))
+	return __builtin_ffsl(ms) - 1;
+#else
+	unsigned long m = 1ul;
+	int i = 0;
+	while(!(m&ms))
+		i++,m<<=1;
+	return i;
+#endif
+}
+
+/* Return the 0-based index of the highest set bit of \p l (l must be != 0),
+   consistent with _starpu_get_first_bit_rank() which returns 0-based indexes. */
+static inline int _starpu_get_last_bit_rank(unsigned long l)
+{
+	STARPU_ASSERT(l != 0);
+#if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))
+	/* clzl counts leading zeros, so the highest set bit has index
+	   (bits - 1) - clzl(l); the previous code omitted the -1 and thus
+	   returned index+1, making starpu_bitmap_last() off by one. */
+	return 8*sizeof(l) - 1 - __builtin_clzl(l);
+#else
+	int ibit = _STARPU_LONG_BIT - 1;
+	/* walk down until the set bit is found; parentheses fixed: the test
+	   must be !((1ul << ibit) & l) — the previous
+	   (!(1ul << ibit)) & l was always 0, so the loop never ran and the
+	   function always returned _STARPU_LONG_BIT - 1. */
+	while(!((1ul << ibit) & l))
+		ibit--;
+	STARPU_ASSERT(ibit >= 0);
+	return ibit;
+#endif
+}
+
+static inline int starpu_bitmap_first(struct starpu_bitmap * b)
+{
+	int i = 0;
+	while(i < _STARPU_BITMAP_SIZE && !b->bits[i])
+		i++;
+	if( i == _STARPU_BITMAP_SIZE)
+		return -1;
+	int nb_long = i;
+	unsigned long ms = b->bits[i];
+
+	return (nb_long * _STARPU_LONG_BIT) + _starpu_get_first_bit_rank(ms);
+}
+
+static inline int starpu_bitmap_has_next(struct starpu_bitmap * b, int e)
+{
+	int nb_long = (e+1) / _STARPU_LONG_BIT;
+	int nb_bit = (e+1) % _STARPU_LONG_BIT;
+	/* when e is the last representable bit, (e+1) lands one word past the
+	   end of bits[]; guard against that out-of-bounds read */
+	if(nb_long >= _STARPU_BITMAP_SIZE)
+		return 0;
+	unsigned long mask = (~0ul) << nb_bit;
+	if(b->bits[nb_long] & mask)
+		return 1;
+	for(nb_long++; nb_long < _STARPU_BITMAP_SIZE; nb_long++)
+		if(b->bits[nb_long])
+			return 1;
+	return 0;
+}
+
+static inline int starpu_bitmap_last(struct starpu_bitmap * b)
+{
+	if(b->cardinal == 0)
+		return -1;
+	int ilong;
+	for(ilong = _STARPU_BITMAP_SIZE - 1; ilong >= 0; ilong--)
+	{
+		if(b->bits[ilong])
+			break;
+	}
+	STARPU_ASSERT(ilong >= 0);
+	unsigned long l = b->bits[ilong];
+	return ilong * _STARPU_LONG_BIT + _starpu_get_last_bit_rank(l);
+}
+
+static inline int starpu_bitmap_next(struct starpu_bitmap *b, int e)
+{
+	int nb_long = e / _STARPU_LONG_BIT;
+	int nb_bit = e % _STARPU_LONG_BIT;
+	unsigned long rest = nb_bit == _STARPU_LONG_BIT - 1 ? 0 : (~0ul << (nb_bit + 1)) & b->bits[nb_long];
+	if(nb_bit != (_STARPU_LONG_BIT - 1) && rest)
+	{
+		int i = _starpu_get_first_bit_rank(rest);
+		STARPU_ASSERT(i >= 0 && i < _STARPU_LONG_BIT);
+		return (nb_long * _STARPU_LONG_BIT) + i;
+	}
+
+	for(nb_long++;nb_long < _STARPU_BITMAP_SIZE; nb_long++)
+		if(b->bits[nb_long])
+			return nb_long * _STARPU_LONG_BIT + _starpu_get_first_bit_rank(b->bits[nb_long]);
+	return -1;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __STARPU_BITMAP_H__ */

+ 3 - 3
include/starpu_sched_component.h

@@ -69,14 +69,14 @@ struct starpu_sched_component
 	/** The tree containing the component*/
 	struct starpu_sched_tree *tree;
 	/** set of underlying workers */
-	struct starpu_bitmap *workers;
+	struct starpu_bitmap workers;
 	/**
 	   subset of starpu_sched_component::workers that is currently available in the context
 	   The push method should take this value into account, it is set with:
 	   component->workers UNION tree->workers UNION
 	   component->child[i]->workers_in_ctx iff exist x such as component->children[i]->parents[x] == component
 	*/
-	struct starpu_bitmap *workers_in_ctx;
+	struct starpu_bitmap workers_in_ctx;
 	/** private data */
 	void *data;
 	char *name;
@@ -188,7 +188,7 @@ struct starpu_sched_tree
 	/**
 	   set of workers available in this context, this value is used to mask workers in modules
 	*/
-	struct starpu_bitmap *workers;
+	struct starpu_bitmap workers;
 	/**
 	   context id of the scheduler
 	*/

julia/StarPU.jl/Makefile → julia/Makefile


julia/StarPU.jl/Manifest.toml → julia/Manifest.toml


julia/StarPU.jl/Project.toml → julia/Project.toml


+ 53 - 0
julia/README

@@ -0,0 +1,53 @@
+Contents
+========
+
+* Installing Julia
+* Installing StarPU module for Julia
+* Running Examples
+
+Installing Julia
+----------------
+Julia version 1.3+ is required and can be downloaded from
+https://julialang.org/downloads/.
+
+
+Installing StarPU module for Julia
+----------------------------------
+First, build the jlstarpu_c_wrapper library:
+
+$ make
+
+Then, you need to add the lib/ directory to your library path and the julia/
+directory to your Julia load path:
+
+$ export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PWD/lib
+$ export JULIA_LOAD_PATH=$JULIA_LOAD_PATH:$PWD
+
+This step can also be done by sourcing the setenv.sh script:
+
+$ . setenv.sh
+
+Running Examples
+----------------
+
+You can find several examples in the examples/ directory.
+
+For each example X, three versions are provided:
+
+- X.c: Original C+starpu code
+- X_native.jl: Native Julia version (without StarPU)
+- X.jl: Julia version using StarPU
+
+
+To run the original C+StarPU code:
+$ make cstarpu.dat
+
+To run the native Julia version:
+$ make julia_native.dat
+
+To run the Julia version using StarPU:
+$ make julia_generatedc.dat
+
+
+
+

julia/StarPU.jl/REQUIRE → julia/REQUIRE


julia/black_scholes/black_scholes.c → julia/examples/black_scholes/black_scholes.c


+ 0 - 2
julia/black_scholes/black_scholes.jl

@@ -115,8 +115,6 @@ using StarPU
     return 0
 end
 
-
-@debugprint "starpu_init"
 starpu_init()
 
 function black_scholes_starpu(data ::Matrix{Float64}, res ::Matrix{Float64}, nslices ::Int64)

+ 58 - 0
julia/examples/mandelbrot/Makefile

@@ -0,0 +1,58 @@
+CC=gcc
+NVCC=nvcc
+ENABLE_CUDA=no
+LD=$(CC)
+
+ifeq ($(ENABLE_CUDA),yes)
+        LD := ${NVCC}
+endif
+
+CFLAGS = -O3 -g $(shell pkg-config --cflags starpu-1.3)
+CPU_CFLAGS = ${CFLAGS} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
+CUDA_CFLAGS = ${CFLAGS}
+LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
+
+EXTERNLIB=extern_tasks.so
+GENERATEDLIB=generated_tasks.so
+
+C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
+ifneq ($(ENABLE_CUDA),yes)
+	CUDA_OBJECTS:=
+endif
+
+LIBPATH=${PWD}/../StarPU.jl/lib
+
+all: ${EXTERNLIB}
+
+mandelbrot: mandelbrot.c cpu_mandelbrot.o #gpu_mandelbrot.o
+	$(CC) $(CPU_CFLAGS) $^ -o $@ $(LDFLAGS) -lm
+
+%.o: %.c
+	$(CC) -c -fPIC $(CPU_CFLAGS) $^ -o $@
+
+%.o: %.cu
+	$(NVCC) -dc $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+
+${EXTERNLIB}: cpu_mandelbrot.c
+	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
+
+${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
+	$(LD) -shared $(LDFLAGS) $^ -o $@
+
+.PHONY: clean
+
+clean:
+	rm -f mandelbrot *.so *.o genc_*.c gencuda_*.cu *.dat
+
+# Performance Tests
+cstarpu.dat: mandelbrot
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./mandelbrot -0.800671 -0.158392 32 32 4096 4 > $@
+julia_generatedc.dat:
+	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mandelbrot.jl $@
+julia_native.dat:
+	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mandelbrot_native.jl $@
+julia_calllib.dat: ${EXTERNLIB}
+	LD_LIBRARY_PATH+=${LIBPATH} JULIA_TASK_LIB="${EXTERNLIB}" STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia mandelbrot.jl julia_calllib.dat
+
+test: cstarpu.dat julia_generatedc.dat julia_native.dat julia_calllib.dat

+ 60 - 0
julia/examples/mandelbrot/cpu_mandelbrot.c

@@ -0,0 +1,60 @@
+#include <stdio.h>
+#include <starpu.h>
+#include <math.h>
+
+void cpu_mandelbrot(void *descr[], void *cl_arg)
+{
+        long long *pixels;
+        float *params;
+
+        pixels = (long long int *)STARPU_MATRIX_GET_PTR(descr[0]);
+        params = (float *)STARPU_MATRIX_GET_PTR(descr[1]);
+
+        long long width = STARPU_MATRIX_GET_NY(descr[0]);
+        long long height = STARPU_MATRIX_GET_NX(descr[0]);
+        double zoom = width * 0.25296875;
+        double iz = 1. / zoom;
+        float diverge = 4.0;
+        float max_iterations = (width/2) * 0.049715909 * log10(zoom);
+        float imi = 1. / max_iterations;
+        float centerr = params[0];
+        float centeri = params[1];
+        float offset = params[2];
+        float dim = params[3];
+        double cr = 0;
+        double zr = 0;
+        double ci = 0;
+        double zi = 0;
+        long long n = 0;
+        double tmp = 0;
+        int ldP = STARPU_MATRIX_GET_LD(descr[0]);
+
+        long long x,y;
+
+        for (y = 0; y < height; y++){
+                for (x = 0; x < width; x++){
+                        cr = centerr + (x - (dim/2)) * iz;
+			zr = cr;
+                        ci = centeri + (y+offset - (dim/2)) * iz;
+                        zi = ci;
+
+                        for (n = 0; n <= max_iterations; n++) {
+				if (zr*zr + zi*zi>diverge) break;
+                                tmp = zr*zr - zi*zi + cr;
+                                zi = 2*zr*zi + ci;
+                                zr = tmp;
+                        }
+			if (n<max_iterations)
+				pixels[y +x*ldP] = round(15.*n*imi);
+			else
+				pixels[y +x*ldP] = 0;
+		}
+	}
+}
+
+char* CPU = "cpu_mandelbrot";
+char* GPU = "gpu_mandelbrot";
+extern char *starpu_find_function(char *name, char *device) {
+	if (!strcmp(device,"gpu")) return GPU;
+	return CPU;
+}

+ 166 - 0
julia/examples/mandelbrot/mandelbrot.c

@@ -0,0 +1,166 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2019       Mael Keryell
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <starpu.h>
+
+void cpu_mandelbrot(void **, void *);
+void gpu_mandelbrot(void **, void *);
+
+static struct starpu_perfmodel model =
+{
+		.type = STARPU_HISTORY_BASED,
+		.symbol = "history_perf"
+};
+
+static struct starpu_codelet cl =
+{
+	.cpu_funcs = {cpu_mandelbrot},
+	//.cuda_funcs = {gpu_mandelbrot},
+	.nbuffers = 2,
+	.modes = {STARPU_W, STARPU_R},
+	.model = &model
+};
+
+
+void mandelbrot_with_starpu(long long *pixels, float *params, long long dim, long long nslicesx)
+{
+	starpu_data_handle_t pixels_handle;
+	starpu_data_handle_t params_handle;
+
+	starpu_matrix_data_register(&pixels_handle, STARPU_MAIN_RAM, (uintptr_t)pixels, dim, dim, dim, sizeof(long long));
+	starpu_matrix_data_register(&params_handle, STARPU_MAIN_RAM, (uintptr_t)params, 4*nslicesx, 4*nslicesx, 1, sizeof(float));
+
+	struct starpu_data_filter horiz =
+	{
+		.filter_func = starpu_matrix_filter_block,
+		.nchildren = nslicesx
+	};
+
+	starpu_data_partition(pixels_handle, &horiz);
+	starpu_data_partition(params_handle, &horiz);
+
+	long long taskx;
+
+	for (taskx = 0; taskx < nslicesx; taskx++){
+		struct starpu_task *task = starpu_task_create();
+
+		task->cl = &cl;
+		task->handles[0] = starpu_data_get_child(pixels_handle, taskx);
+		task->handles[1] = starpu_data_get_child(params_handle, taskx);
+		if (starpu_task_submit(task)!=0) fprintf(stderr,"submit task error\n");
+	}
+
+	starpu_task_wait_for_all();
+
+	starpu_data_unpartition(pixels_handle, STARPU_MAIN_RAM);
+	starpu_data_unpartition(params_handle, STARPU_MAIN_RAM);
+
+	starpu_data_unregister(pixels_handle);
+	starpu_data_unregister(params_handle);
+}
+
+void pixels2img(long long *pixels, long long width, long long height, const char *filename)
+{
+  FILE *fp = fopen(filename, "w");
+  if (!fp)
+    return;
+
+  int MAPPING[16][3] = {{66,30,15},{25,7,26},{9,1,47},{4,4,73},{0,7,100},{12,44,138},{24,82,177},{57,125,209},{134,181,229},{211,236,248},{241,233,191},{248,201,95},{255,170,0},{204,128,0},{153,87,0},{106,52,3}};
+
+  fprintf(fp, "P3\n%lld %lld\n255\n", width, height);
+  long long i, j;
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      fprintf(fp, "%d %d %d ", MAPPING[pixels[j*width+i]][0], MAPPING[pixels[j*width+i]][1], MAPPING[pixels[j*width+i]][2]);
+    }
+  }
+
+  fclose(fp);
+}
+
+double min_times(double cr, double ci, long long dim, long long nslices)
+{
+	long long *pixels = calloc(dim*dim, sizeof(long long));
+	float *params = calloc(4*nslices, sizeof(float));
+
+	double t_min = 0;
+	long long i;
+
+	for (i=0; i<nslices; i++) {
+		params[4*i+0] = cr;
+		params[4*i+1] = ci;
+		params[4*i+2] = i*dim/nslices;
+		params[4*i+3] = dim;
+	}
+
+	double start, stop, exec_t;
+	for (i = 0; i < 10; i++){
+		start = starpu_timing_now(); // starpu_timing_now() gives the time in microseconds.
+		mandelbrot_with_starpu(pixels, params, dim, nslices);
+		stop = starpu_timing_now();
+		exec_t = (stop-start)*1.e3;
+		if (t_min==0 || t_min>exec_t)
+		  t_min = exec_t;
+	}
+
+	char filename[64];
+	snprintf(filename, 64, "out%lld.ppm", dim);
+	pixels2img(pixels,dim,dim,filename);
+
+	free(pixels);
+	free(params);
+
+	return t_min;
+}
+
+void display_times(double cr, double ci, long long start_dim, long long step_dim, long long stop_dim, long long nslices)
+{
+
+	long long dim;
+
+	for (dim = start_dim; dim <= stop_dim; dim += step_dim) {
+		printf("Dimension: %lld...\n", dim);
+		double res = min_times(cr, ci, dim, nslices);
+		res = res / dim / dim; // time per pixel
+		printf("%lld %lf\n", dim, res);
+	}
+}
+
+int main(int argc, char **argv)
+{
+	if (argc != 7){
+		printf("Usage: %s cr ci start_dim step_dim stop_dim nslices(must divide dims)\n", argv[0]);
+		return 1;
+	}
+	if (starpu_init(NULL) != EXIT_SUCCESS){
+		fprintf(stderr, "ERROR\n");
+		return 77;
+	}
+
+	double cr = (float) atof(argv[1]);
+	double ci = (float) atof(argv[2]);
+	long long start_dim = atoll(argv[3]);
+	long long step_dim = atoll(argv[4]);
+	long long stop_dim = atoll(argv[5]);
+	long long nslices = atoll(argv[6]);
+
+	display_times(cr, ci, start_dim, step_dim, stop_dim, nslices);
+
+	starpu_shutdown();
+
+	return 0;
+}

+ 12 - 22
julia/mandelbrot/mandelbrot.jl

@@ -3,7 +3,7 @@ using StarPU
 using LinearAlgebra
 
 @target STARPU_CPU+STARPU_CUDA
-@codelet function mandelbrot(pixels ::Matrix{Int64}, params ::Matrix{Float32} ) :: Float32
+@codelet function mandelbrot(pixels ::Matrix{Int64}, centerr ::Float64, centeri ::Float64, offset ::Int64, dim ::Int64 ) :: Nothing
     height :: Int64 = height(pixels)
     width :: Int64 = width(pixels)
     zoom :: Float64 = width * 0.25296875
@@ -11,10 +11,6 @@ using LinearAlgebra
     diverge :: Float32 = 4.0
     max_iterations :: Float32 = ((width/2) * 0.049715909 * log10(zoom));
     imi :: Float32 = 1. / max_iterations
-    centerr :: Float32 = params[1,1]
-    centeri :: Float32 = params[2,1]
-    offset :: Float32 = params[3,1]
-    dim :: Float32 = params[4,1]
     cr :: Float64 = 0.
     zr :: Float64 = 0.
     ci :: Float64 = 0.
@@ -27,7 +23,10 @@ using LinearAlgebra
             zr = cr
             ci = centeri + (y-1+offset - (dim / 2)) * iz
             zi = ci
-            for n = 0:max_iterations
+            max_it :: Float64 = max_iterations
+            n = 0
+            for i = 0:max_it
+                n = i
                 if (zr*zr + zi*zi > diverge)
                     break
                 end
@@ -43,21 +42,20 @@ using LinearAlgebra
             end
         end
     end
-    return 0. :: Float32
+
+    return
 end
 
-@debugprint "starpu_init"
 starpu_init()
 
-function mandelbrot_with_starpu(A ::Matrix{Int64}, params ::Matrix{Float32}, nslicesx ::Int64)
+function mandelbrot_with_starpu(A ::Matrix{Int64}, cr ::Float64, ci ::Float64, dim ::Int64, nslicesx ::Int64)
     horiz = StarpuDataFilter(STARPU_MATRIX_FILTER_BLOCK, nslicesx)
     @starpu_block let
-	hA, hP = starpu_data_register(A,params)
+	hA = starpu_data_register(A)
 	starpu_data_partition(hA,horiz)
-        starpu_data_partition(hP,horiz)
-        
+
 	@starpu_sync_tasks for taskx in (1 : nslicesx)
-                @starpu_async_cl mandelbrot(hA[taskx], hP[taskx]) [STARPU_W, STARPU_R]
+                @starpu_async_cl mandelbrot(hA[taskx]) [STARPU_W] [cr, ci, (taskx-1)*dim/nslicesx, dim]
 	end
     end
 end
@@ -79,16 +77,9 @@ function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
     tmin=0;
     
     pixels ::Matrix{Int64} = zeros(dim, dim)
-    params :: Matrix{Float32} = zeros(4*nslices,1)
-    for i=0:(nslices-1)
-        params[4*i+1,1] = cr
-        params[4*i+2,1] = ci
-        params[4*i+3,1] = i*dim/nslices
-        params[4*i+4,1] = dim
-    end
     for i = 1:10
         t = time_ns();
-        mandelbrot_with_starpu(pixels, params, nslices)
+        mandelbrot_with_starpu(pixels, cr, ci, dim, nslices)
         t = time_ns()-t
         if (tmin==0 || tmin>t)
             tmin=t
@@ -109,6 +100,5 @@ end
 
 display_time(-0.800671,-0.158392,32,32,4096,4)
 
-@debugprint "starpu_shutdown"
 starpu_shutdown()
 

+ 96 - 0
julia/examples/mandelbrot/mandelbrot_native.jl

@@ -0,0 +1,96 @@
+using LinearAlgebra
+
+function mandelbrot(pixels, centerr ::Float64, centeri ::Float64, offset ::Int64, dim ::Int64) :: Nothing
+    height :: Int64, width :: Int64 = size(pixels)
+    zoom :: Float64 = width * 0.25296875
+    iz :: Float64 = 1. / zoom
+    diverge :: Float32 = 4.0
+    max_iterations :: Float32 = ((width/2) * 0.049715909 * log10(zoom));
+    imi :: Float64 = 1. / max_iterations
+    cr :: Float64 = 0.
+    zr :: Float64 = 0.
+    ci :: Float64 = 0.
+    zi :: Float64 = 0.
+    n :: Int64 = 0
+    tmp :: Float64 = 0.
+    for y = 1:height
+        for x = 1:width
+            cr = centerr + (x-1 - (dim / 2)) * iz
+            zr = cr
+            ci = centeri + (y-1+offset - (dim / 2)) * iz
+            zi = ci
+            n = 0
+            for i = 0:max_iterations
+                n = i
+                if (zr*zr + zi*zi > diverge)
+                    break
+                end
+                tmp = zr*zr - zi*zi + cr
+                zi = 2*zr*zi + ci
+                zr = tmp
+            end
+
+            if (n < max_iterations)
+                pixels[y,x] = round(15 * n * imi)
+            else
+                pixels[y,x] = 0
+            end
+        end
+    end
+
+    return
+end
+
+function mandelbrot_without_starpu(A ::Matrix{Int64}, cr ::Float64, ci ::Float64, dim ::Int64, nslicesx ::Int64)
+    width,height = size(A)
+    step = height / nslicesx
+
+    for taskx in (1 : nslicesx)
+        start_id = floor(Int64, (taskx-1)*step+1)
+        end_id = floor(Int64, (taskx-1)*step+step)
+        a = view(A, start_id:end_id, :)
+
+        offset ::Int64 = (taskx-1)*dim/nslicesx
+        mandelbrot(a, cr, ci, offset, dim)
+    end
+end
+
+function pixels2img(pixels ::Matrix{Int64}, width ::Int64, height ::Int64, filename ::String)
+    MAPPING = [[66,30,15],[25,7,26],[9,1,47],[4,4,73],[0,7,100],[12,44,138],[24,82,177],[57,125,209],[134,181,229],[211,236,248],[241,233,191],[248,201,95],[255,170,0],[204,128,0],[153,87,0],[106,52,3]]
+    open(filename, "w") do f
+        write(f, "P3\n$width $height\n255\n")
+        for i = 1:height
+            for j = 1:width
+                write(f,"$(MAPPING[1+pixels[i,j]][1]) $(MAPPING[1+pixels[i,j]][2]) $(MAPPING[1+pixels[i,j]][3]) ")
+            end
+            write(f, "\n")
+        end
+    end
+end
+
+function min_times(cr ::Float64, ci ::Float64, dim ::Int64, nslices ::Int64)
+    tmin=0;
+
+    pixels ::Matrix{Int64} = zeros(dim, dim)
+    for i = 1:10
+        t = time_ns();
+        mandelbrot_without_starpu(pixels, cr, ci, dim, nslices)
+        t = time_ns()-t
+        if (tmin==0 || tmin>t)
+            tmin=t
+        end
+    end
+    pixels2img(pixels,dim,dim,"out$(dim).ppm")
+    return tmin
+end
+
+function display_time(cr ::Float64, ci ::Float64, start_dim ::Int64, step_dim ::Int64, stop_dim ::Int64, nslices ::Int64)
+    for dim in (start_dim : step_dim : stop_dim)
+        res = min_times(cr, ci, dim, nslices)
+        res=res/dim/dim; # time per pixel
+        println("$(dim) $(res)")
+    end
+end
+
+
+display_time(-0.800671,-0.158392,32,32,4096,4)

+ 28 - 16
julia/mult/Makefile

@@ -6,40 +6,52 @@ STRIDE=72
 #CFLAGS=-restrict -unroll4 -ipo -falign-loops=256 -O3 -DSTRIDE=${STRIDE} -march=native $(shell pkg-config --cflags starpu-1.3)
 # GCC compiler
 CC=gcc
-CFLAGS += -O3 -DSTRIDE=${STRIDE} -mavx -fomit-frame-pointer -march=native -ffast-math $(shell pkg-config --cflags starpu-1.3)
+NVCC=nvcc
+ENABLE_CUDA=no
+LD=$(CC)
 
+ifeq ($(ENABLE_CUDA),yes)
+        LD := ${NVCC}
+endif
+
+CFLAGS = -O3 -g -DSTRIDE=${STRIDE} $(shell pkg-config --cflags starpu-1.3)
+CPU_CFLAGS = ${CFLAGS} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
+CUDA_CFLAGS = ${CFLAGS}
 LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
+
 EXTERNLIB=extern_tasks.so
 GENERATEDLIB=generated_tasks.so
-#OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
-OBJECTS=$(wildcard gen*.c)
+
+C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
+ifneq ($(ENABLE_CUDA),yes)
+	CUDA_OBJECTS:=
+endif
+
+
 LIBPATH=${PWD}/../StarPU.jl/lib
 
 all: ${EXTERNLIB}
 
 mult: mult.c cpu_mult.o #gpu_mult.o
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
-
-gpu_mult.o: gpu_mult.cu
-	nvcc -c $(CFLAGS) $^ -o $@
+	$(CC) $(CPU_CFLAGS) $^ -o $@ $(LDFLAGS)
 
 %.o: %.c
-	$(CC) -c $(CFLAGS) $^ -o $@
+	$(CC) -c -fPIC $(CPU_CFLAGS) $^ -o $@
+
+%.o: %.cu
+	$(NVCC) -dc $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
 
 ${EXTERNLIB}: cpu_mult.c
 	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
 
-gpu_mult.so: gpu_mult.o
-	nvcc $(CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
+	$(LD) -shared $(LDFLAGS) $^ -o $@
 
-cpu_mult_sa: cpu_mult_sa.o
-	$(CC) $(CFLAGS) $^ -o $@ $(LDFLAGS)
-
-${GENERATEDLIB}: ${OBJECTS}
-	$(CC) $(CFLAGS) -shared -fPIC $(LDFLAGS) $^ -o $@
+.PHONY: clean
 
 clean:
-	rm -f mult *.so *.o c_*.genc gencuda_*.cu *.dat
+	rm -f mult *.so *.o genc_*.c gencuda_*.cu *.dat
 
 # Performance Tests
 cstarpu.dat: mult

julia/mult/README → julia/examples/mult/README


julia/mult/cpu_mult.c → julia/examples/mult/cpu_mult.c


julia/mult/gpu_mult.cu → julia/examples/mult/gpu_mult.cu


julia/mult/mult.c → julia/examples/mult/mult.c


+ 43 - 44
julia/mult/mult.jl

@@ -6,58 +6,57 @@ using LinearAlgebra
 const STRIDE = 72
 
 @target STARPU_CPU+STARPU_CUDA
-@codelet function matrix_mult(m1 :: Matrix{Float32}, m2 :: Matrix{Float32}, m3 :: Matrix{Float32}) :: Float32
+@codelet function matrix_mult(m1 :: Matrix{Float32}, m2 :: Matrix{Float32}, m3 :: Matrix{Float32}) :: Nothing
 
     width_m2 :: Int32 = width(m2)
     height_m1 :: Int32 = height(m1)
     width_m1 :: Int32 = width(m1)
     # Naive version
-    #@parallel for j in (1 : width_m2)
-    #    @parallel for i in (1 : height_m1)
-    #
-    #          sum :: Float32 = 0.
-
-    #          for k in (1 : width_m1)
-    #              sum = sum + m1[i, k] * m2[k, j]
-    #          end
+    @parallel for j in (1 : width_m2)
+       @parallel for i in (1 : height_m1)
     
-    #          m3[i, j] = sum
-    #      end
-    #  end
-    ##### Tiled and unrolled version 
-    for l in (1 : width_m2)
-        for m in (1 : height_m1)
-            m3[m,l] = 0
-        end
-    end
-    @parallel for i in (1 : STRIDE : height_m1)
-        for k in (1 : STRIDE : width_m1 )
-            for j in (1 : STRIDE : width_m2  )
-                for kk in (k : 4 : k+STRIDE-1)
-                    for jj in (j : 2 : j+STRIDE-1)
-                        alpha00 :: Float32 =m2[kk,jj]
-                        alpha01 :: Float32 =m2[kk,jj+1]
-                        alpha10 :: Float32 =m2[kk+1,jj]
-                        alpha11 :: Float32 =m2[kk+1,jj+1]
-                        alpha20 :: Float32 =m2[kk+2,jj]
-                        alpha21 :: Float32 =m2[kk+2,jj+1]
-                        alpha30 :: Float32 =m2[kk+3,jj]
-                        alpha31 :: Float32 =m2[kk+3,jj+1]
-                        for ii in (i : 1 : i+STRIDE-1) 
-                            m3[ii, jj] = m3[ii, jj] + m1[ii, kk] * alpha00 + m1[ii, kk+1] * alpha10 + m1[ii, kk+2] * alpha20 + m1[ii,kk+3]*alpha30
-                            m3[ii, jj+1] = m3[ii, jj+1] + m1[ii, kk] * alpha01 + m1[ii, kk+1] * alpha11 + m1[ii, kk+2]*alpha21 + m1[ii,kk+3]*alpha31 
-                        end
-                    end
-                end
-            end
-        end
-    end
+             sum :: Float32 = 0.
 
-    return 0. :: Float32
+             for k in (1 : width_m1)
+                 sum = sum + m1[i, k] * m2[k, j]
+             end
+    
+             m3[i, j] = sum
+         end
+     end
+    # ##### Tiled and unrolled version 
+    # for l in (1 : width_m2)
+    #     for m in (1 : height_m1)
+    #         m3[m,l] = 0
+    #     end
+    # end
+    # @parallel for i in (1 : STRIDE : height_m1)
+    #     for k in (1 : STRIDE : width_m1 )
+    #         for j in (1 : STRIDE : width_m2  )
+    #             for kk in (k : 4 : k+STRIDE-1)
+    #                 for jj in (j : 2 : j+STRIDE-1)
+    #                     alpha00 :: Float32 =m2[kk,jj]
+    #                     alpha01 :: Float32 =m2[kk,jj+1]
+    #                     alpha10 :: Float32 =m2[kk+1,jj]
+    #                     alpha11 :: Float32 =m2[kk+1,jj+1]
+    #                     alpha20 :: Float32 =m2[kk+2,jj]
+    #                     alpha21 :: Float32 =m2[kk+2,jj+1]
+    #                     alpha30 :: Float32 =m2[kk+3,jj]
+    #                     alpha31 :: Float32 =m2[kk+3,jj+1]
+    #                     for ii in (i : 1 : i+STRIDE-1) 
+    #                         m3[ii, jj] = m3[ii, jj] + m1[ii, kk] * alpha00 + m1[ii, kk+1] * alpha10 + m1[ii, kk+2] * alpha20 + m1[ii,kk+3]*alpha30
+    #                         m3[ii, jj+1] = m3[ii, jj+1] + m1[ii, kk] * alpha01 + m1[ii, kk+1] * alpha11 + m1[ii, kk+2]*alpha21 + m1[ii,kk+3]*alpha31 
+    #                     end
+    #                 end
+    #             end
+    #         end
+    #     end
+    # end
+
+    return
 end
 
 
-@debugprint "starpu_init"
 starpu_init()
 
 function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy)
@@ -77,7 +76,7 @@ function multiply_with_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: M
         )
         cl = StarpuCodelet(
             cpu_func = CPU_CODELETS["matrix_mult"],
-            #cuda_func = "matrix_mult",
+            # cuda_func = CUDA_CODELETS["matrix_mult"],
             #opencl_func="ocl_matrix_mult",
             modes = [STARPU_R, STARPU_R, STARPU_W],
             perfmodel = perfmodel
@@ -141,6 +140,6 @@ end
 io=open(ARGS[1],"w")
 compute_times(io,16*STRIDE,4*STRIDE,4096,2,2)
 close(io)
-@debugprint "starpu_shutdown"
+
 starpu_shutdown()
 

julia/mult/mult.plot → julia/examples/mult/mult.plot


+ 0 - 5
julia/mult/mult_native.jl

@@ -5,9 +5,6 @@ using LinearAlgebra
 #shoud be the same as in the makefile
 const STRIDE = 72
 
-@debugprint "starpu_init"
-starpu_init()
-
 function multiply_without_starpu(A :: Matrix{Float32}, B :: Matrix{Float32}, C :: Matrix{Float32}, nslicesx, nslicesy)
     tmin = 0
     for i in (1 : 10 )
@@ -39,6 +36,4 @@ end
 io=open(ARGS[1],"w")
 compute_times(io,16*STRIDE,4*STRIDE,4096,2,2)
 close(io)
-@debugprint "starpu_shutdown"
-starpu_shutdown()
 

julia/mult/res/mult_cstarpu_gcc9_s72_2x2_b4x2.dat → julia/examples/mult/res/mult_cstarpu_gcc9_s72_2x2_b4x2.dat


julia/mult/res/mult_gen_gcc9_1x4.dat → julia/examples/mult/res/mult_gen_gcc9_1x4.dat


julia/mult/res/mult_gen_gcc9_4x1.dat → julia/examples/mult/res/mult_gen_gcc9_4x1.dat


julia/mult/res/mult_gen_gcc9_s100_4x1.dat → julia/examples/mult/res/mult_gen_gcc9_s100_4x1.dat


julia/mult/res/mult_gen_gcc9_s50_4x1.dat → julia/examples/mult/res/mult_gen_gcc9_s50_4x1.dat


julia/mult/res/mult_gen_gcc9_s64_16x16_b4x2.dat → julia/examples/mult/res/mult_gen_gcc9_s64_16x16_b4x2.dat


julia/mult/res/mult_gen_gcc9_s64_4x4_b4x2.dat → julia/examples/mult/res/mult_gen_gcc9_s64_4x4_b4x2.dat


julia/mult/res/mult_gen_gcc9_s64_8x1_b4x2.dat → julia/examples/mult/res/mult_gen_gcc9_s64_8x1_b4x2.dat


julia/mult/res/mult_gen_gcc9_s64_8x8_b4x2.dat → julia/examples/mult/res/mult_gen_gcc9_s64_8x8_b4x2.dat


julia/mult/res/mult_gen_gcc9_s72_16x18_b4x2.dat → julia/examples/mult/res/mult_gen_gcc9_s72_16x18_b4x2.dat


julia/mult/res/mult_gen_gcc9_s72_16x8_b4x2.dat → julia/examples/mult/res/mult_gen_gcc9_s72_16x8_b4x2.dat


julia/mult/res/mult_gen_gcc9_s72_2x2.dat → julia/examples/mult/res/mult_gen_gcc9_s72_2x2.dat


julia/mult/res/mult_gen_gcc9_s72_2x2_b4x2.dat → julia/examples/mult/res/mult_gen_gcc9_s72_2x2_b4x2.dat


julia/mult/res/mult_gen_gcc9_s72_2x2_b4x4.dat → julia/examples/mult/res/mult_gen_gcc9_s72_2x2_b4x4.dat


julia/mult/res/mult_gen_gcc9_s72_2x2_b8x2.dat → julia/examples/mult/res/mult_gen_gcc9_s72_2x2_b8x2.dat


julia/mult/res/mult_gen_gcc9_s72_4x1.dat → julia/examples/mult/res/mult_gen_gcc9_s72_4x1.dat


julia/mult/res/mult_gen_gcc9_s72_4x4_b4x2.dat → julia/examples/mult/res/mult_gen_gcc9_s72_4x4_b4x2.dat


julia/mult/res/mult_gen_gcc9_s72_8x8_b4x2.dat → julia/examples/mult/res/mult_gen_gcc9_s72_8x8_b4x2.dat


julia/mult/res/mult_gen_gcc9_s80_4x1.dat → julia/examples/mult/res/mult_gen_gcc9_s80_4x1.dat


julia/mult/res/mult_gen_icc_s72_2x1_b4x2.dat → julia/examples/mult/res/mult_gen_icc_s72_2x1_b4x2.dat


julia/mult/res/mult_gen_icc_s72_4x4_b4x2.dat → julia/examples/mult/res/mult_gen_icc_s72_4x4_b4x2.dat


julia/mult/res/mult_native.dat → julia/examples/mult/res/mult_native.dat


julia/mult/res/mult_nogen_gcc9_s72_2x2_b2x2.dat → julia/examples/mult/res/mult_nogen_gcc9_s72_2x2_b2x2.dat


julia/mult/res/mult_nogen_gcc9_s72_2x2_b4x2.dat → julia/examples/mult/res/mult_nogen_gcc9_s72_2x2_b4x2.dat


julia/mult/res/mult_nogen_icc_s72-36_2x2_b4x2.dat → julia/examples/mult/res/mult_nogen_icc_s72-36_2x2_b4x2.dat


julia/mult/res/mult_nogen_icc_s72_2x2_b4x2.dat → julia/examples/mult/res/mult_nogen_icc_s72_2x2_b4x2.dat


julia/mult/res/mult_nogen_icc_s72x2_2x2_b4x2.dat → julia/examples/mult/res/mult_nogen_icc_s72x2_2x2_b4x2.dat


julia/tst/Makefile.mk → julia/examples/old_examples/Makefile.mk


julia/tst/README → julia/examples/old_examples/README


julia/tst/cpu_mult.c → julia/examples/old_examples/cpu_mult.c


julia/tst/gpu_mult.cu → julia/examples/old_examples/gpu_mult.cu


julia/tst/includes/display.c → julia/examples/old_examples/includes/display.c


julia/tst/includes/display.h → julia/examples/old_examples/includes/display.h


julia/tst/includes/sorting.c → julia/examples/old_examples/includes/sorting.c


julia/tst/includes/sorting.h → julia/examples/old_examples/includes/sorting.h


julia/tst/mandelbrot/cpu_cuda_mandelbrot.jl → julia/examples/old_examples/mandelbrot/cpu_cuda_mandelbrot.jl


julia/tst/mandelbrot/cpu_mandelbrot.c → julia/examples/old_examples/mandelbrot/cpu_mandelbrot.c


julia/tst/mandelbrot/cpu_mandelbrot_between.c → julia/examples/old_examples/mandelbrot/cpu_mandelbrot_between.c


julia/tst/mandelbrot/gpu_mandelbrot.cu → julia/examples/old_examples/mandelbrot/gpu_mandelbrot.cu


julia/tst/mandelbrot/gpu_mandelbrot_between.cu → julia/examples/old_examples/mandelbrot/gpu_mandelbrot_between.cu


julia/mandelbrot/mandelbrot.c → julia/examples/old_examples/mandelbrot/mandelbrot.c


julia/tst/mandelbrot/mandelbrot.jl → julia/examples/old_examples/mandelbrot/mandelbrot.jl


julia/tst/mandelbrot/mandelbrot_between.c → julia/examples/old_examples/mandelbrot/mandelbrot_between.c


julia/tst/mandelbrot/mandelbrot_def.jl → julia/examples/old_examples/mandelbrot/mandelbrot_def.jl


julia/tst/mandelbrot/mandelbrot_generated.jl → julia/examples/old_examples/mandelbrot/mandelbrot_generated.jl


julia/tst/mult.c → julia/examples/old_examples/mult.c


julia/tst/mult/cpu_cuda_mult.jl → julia/examples/old_examples/mult/cpu_cuda_mult.jl


julia/tst/mult/cpu_mult.c → julia/examples/old_examples/mult/cpu_mult.c


julia/tst/mult/gpu_mult.cu → julia/examples/old_examples/mult/gpu_mult.cu


julia/tst/mult/mult.c → julia/examples/old_examples/mult/mult.c


julia/tst/mult/mult_def.jl → julia/examples/old_examples/mult/mult_def.jl


julia/tst/mult/mult_extern.jl → julia/examples/old_examples/mult/mult_extern.jl


julia/tst/mult/mult_extern_graph.jl → julia/examples/old_examples/mult/mult_extern_graph.jl


julia/tst/mult/mult_generated.jl → julia/examples/old_examples/mult/mult_generated.jl


julia/tst/mult/mult_generated_graph.jl → julia/examples/old_examples/mult/mult_generated_graph.jl


julia/tst/mult/mult_naive.jl → julia/examples/old_examples/mult/mult_naive.jl


julia/tst/nbody/cpu_cuda_nbody.jl → julia/examples/old_examples/nbody/cpu_cuda_nbody.jl


julia/tst/nbody/cpu_nbody.c → julia/examples/old_examples/nbody/cpu_nbody.c


julia/tst/nbody/cpu_nbody_between.c → julia/examples/old_examples/nbody/cpu_nbody_between.c


julia/tst/nbody/gpu_nbody.cu → julia/examples/old_examples/nbody/gpu_nbody.cu


julia/tst/nbody/gpu_nbody_between.cu → julia/examples/old_examples/nbody/gpu_nbody_between.cu


julia/tst/nbody/nbody.c → julia/examples/old_examples/nbody/nbody.c


julia/tst/nbody/nbody.jl → julia/examples/old_examples/nbody/nbody.jl


julia/tst/nbody/nbody_between.c → julia/examples/old_examples/nbody/nbody_between.c


julia/tst/nbody/nbody_def.jl → julia/examples/old_examples/nbody/nbody_def.jl


julia/tst/nbody/nbody_display.jl → julia/examples/old_examples/nbody/nbody_display.jl


julia/tst/nbody/nbody_generated.jl → julia/examples/old_examples/nbody/nbody_generated.jl


+ 51 - 0
julia/examples/task_insert_color/Makefile

@@ -0,0 +1,51 @@
+CC=gcc
+NVCC=nvcc
+ENABLE_CUDA=no
+LD=$(CC)
+
+ifeq ($(ENABLE_CUDA),yes)
+        LD := ${NVCC}
+endif
+
+CFLAGS = -O3 -g $(shell pkg-config --cflags starpu-1.3)
+CPU_CFLAGS = ${CFLAGS} -Wall -mavx -fomit-frame-pointer -march=native -ffast-math
+CUDA_CFLAGS = ${CFLAGS}
+LDFLAGS +=$(shell pkg-config --libs starpu-1.3)
+
+EXTERNLIB=extern_tasks.so
+GENERATEDLIB=generated_tasks.so
+
+C_OBJECTS=$(patsubst %.c,%.o,$(wildcard gen*.c))
+CUDA_OBJECTS=$(patsubst %.cu,%.o,$(wildcard gen*.cu))
+ifneq ($(ENABLE_CUDA),yes)
+	CUDA_OBJECTS:=
+endif
+
+LIBPATH=${PWD}/../StarPU.jl/lib
+
+all: task_insert_color
+
+task_insert_color: task_insert_color.o
+	$(CC) $(CPU_CFLAGS) $^ -o $@ $(LDFLAGS)
+
+%.o: %.c
+	$(CC) -c -fPIC $(CPU_CFLAGS) $^ -o $@
+
+%.o: %.cu
+	$(NVCC) -dc $(CUDA_CFLAGS) $^ --shared --compiler-options '-fPIC' -o $@ $(LDFLAGS)
+
+${GENERATEDLIB}: $(C_OBJECTS) $(CUDA_OBJECTS)
+	$(LD) -shared $(LDFLAGS) $^ -o $@
+
+PHONY: clean
+
+clean:
+	rm -f vector_scal *.so *.o genc_*.c gencuda_*.cu *.dat
+
+# Performance Tests
+cstarpu.dat: task_insert_color
+	STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 ./task_insert_color > $@
+julia_generatedc.dat:
+	LD_LIBRARY_PATH+=${LIBPATH} STARPU_NOPENCL=0 STARPU_SCHED=dmda STARPU_CALIBRATE=1 julia task_insert_colorl.jl
+
+test: cstarpu.dat julia_generatedc.dat

+ 89 - 0
julia/examples/task_insert_color/task_insert_color.c

@@ -0,0 +1,89 @@
+/* StarPU --- Runtime system for heterogeneous multicore architectures.
+ *
+ * Copyright (C) 2018-2020  Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
+ *
+ * StarPU is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * StarPU is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ *
+ * See the GNU Lesser General Public License in COPYING.LGPL for more details.
+ */
+
+#include <starpu.h>
+
+#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
+
+void func(void *descr[], void *_args)
+{
+	int *x = (int *)STARPU_VARIABLE_GET_PTR(descr[0]);
+	(void)_args;
+
+	*x *= 2;
+}
+
+struct starpu_codelet mycodelet =
+{
+	.modes = { STARPU_RW },
+	.cpu_funcs = {func},
+	.cpu_funcs_name = {"func"},
+        .nbuffers = 1
+};
+
+struct starpu_codelet mycodelet_color =
+{
+	.modes = { STARPU_RW },
+	.cpu_funcs = {func},
+	.cpu_funcs_name = {"func"},
+        .nbuffers = 1,
+	.color = 0x0000FF,
+};
+
+int main(void)
+{
+	unsigned i;
+	int value=42;
+	starpu_data_handle_t handle;
+	int ret;
+
+	ret = starpu_init(NULL);
+	if (ret == -ENODEV) goto enodev;
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
+
+	starpu_variable_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)&value, sizeof(value));
+
+	// In the trace file, the following task should be green (executed on CPU)
+	ret = starpu_task_insert(&mycodelet, STARPU_RW, handle, STARPU_NAME, "mytask",
+				 0);
+	if (STARPU_UNLIKELY(ret == -ENODEV))
+	{
+		starpu_data_unregister(handle);
+		goto enodev;
+	}
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+
+	// In the trace file, the following task will be red as specified by STARPU_TASK_COLOR
+	ret = starpu_task_insert(&mycodelet, STARPU_RW, handle, STARPU_NAME, "mytask",
+				 STARPU_TASK_COLOR, 0xFF0000,
+				 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+
+	// In the trace file, the following task will be blue as specified by the field color of mycodelet_color
+	ret = starpu_task_insert(&mycodelet_color, STARPU_RW, handle, STARPU_NAME, "mytask",
+				 0);
+	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");
+
+	starpu_task_wait_for_all();
+	starpu_data_unregister(handle);
+
+	starpu_shutdown();
+
+	return 0;
+
+ enodev:
+	return 77;
+}

+ 48 - 0
julia/examples/task_insert_color/task_insert_color.jl

@@ -0,0 +1,48 @@
+import Libdl
+using StarPU
+
+@target STARPU_CPU
+@codelet function task_insert_color(val ::Ref{Int32}) :: Nothing
+    val[] = val[] * 2
+
+    return
+end
+
+starpu_init()
+
+function task_insert_color_with_starpu(val ::Ref{Int32})
+    @starpu_block let
+	hVal = starpu_data_register(val)
+
+        cl1 = StarpuCodelet(
+            cpu_func = CPU_CODELETS["task_insert_color"],
+            modes = [STARPU_RW]
+        )
+
+        cl2 = StarpuCodelet(
+            cpu_func = CPU_CODELETS["task_insert_color"],
+            modes = [STARPU_RW],
+            color = 0x0000FF
+        )
+
+	@starpu_sync_tasks begin
+
+            # In the trace file, the following task should be green (executed on CPU)
+            starpu_task_submit(StarpuTask(cl = cl1, handles = [hVal]))
+
+            # In the trace file, the following task will be blue as specified by the field color of cl2
+            starpu_task_submit(StarpuTask(cl = cl2, handles = [hVal]))
+
+            # In the trace file, the following tasks will be red as specified in @starpu_async_cl
+            @starpu_async_cl task_insert_color(hVal) [STARPU_RW] [] 0xFF0000
+
+	end
+    end
+end
+
+
+foo = Ref(convert(Int32, 42))
+
+task_insert_color_with_starpu(foo)
+
+starpu_shutdown()

+ 0 - 0
julia/examples/variable/Makefile


Some files were not shown because too many files changed in this diff