Browse Source

Merge branch 'fpga' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu into fpga

Samuel Thibault 5 years ago
parent
commit
584a70d891
100 changed files with 807 additions and 241 deletions
  1. 11 0
      ChangeLog
  2. 15 1
      configure.ac
  3. 1 1
      doc/doxygen/Makefile.am
  4. 2 0
      doc/doxygen/chapters/210_check_list_performance.doxy
  5. 1 1
      doc/doxygen/chapters/301_tasks.doxy
  6. 3 1
      doc/doxygen/chapters/320_scheduling.doxy
  7. 3 0
      doc/doxygen/chapters/350_scheduling_policy_definition.doxy
  8. 9 0
      doc/doxygen/chapters/510_configure_options.doxy
  9. 2 0
      doc/doxygen/chapters/code/vector_scal_cuda.c
  10. 1 1
      doc/doxygen_dev/Makefile.am
  11. 2 0
      doc/tutorial/vector_scal_cuda.cu
  12. 2 0
      examples/basic_examples/block_cuda.cu
  13. 2 0
      examples/basic_examples/multiformat_conversion_codelets_cuda.cu
  14. 2 0
      examples/basic_examples/multiformat_cuda.cu
  15. 2 0
      examples/basic_examples/variable_kernels.cu
  16. 2 0
      examples/basic_examples/vector_scal_cuda.cu
  17. 2 0
      examples/filters/custom_mf/conversion.cu
  18. 2 0
      examples/filters/custom_mf/cuda.cu
  19. 2 0
      examples/filters/fblock_cuda.cu
  20. 4 0
      examples/filters/fmultiple_cuda.cu
  21. 2 0
      examples/incrementer/incrementer_kernels.cu
  22. 2 0
      examples/interface/complex_kernels.cu
  23. 1 0
      examples/mult/double.h
  24. 1 0
      examples/mult/simple.h
  25. 1 1
      examples/mult/xgemm.c
  26. 2 0
      examples/pi/SobolQRNG/sobol_gpu.cu
  27. 4 0
      examples/pi/pi_kernel.cu
  28. 4 0
      examples/pi/pi_redux_kernel.cu
  29. 2 0
      examples/reductions/dot_product_kernels.cu
  30. 2 0
      examples/sched_ctx/axpy_partition_gpu.cu
  31. 2 0
      examples/spmv/spmv_cuda.cu
  32. 2 0
      examples/stencil/life_cuda.cu
  33. 2 0
      examples/stencil/shadow.cu
  34. 9 0
      include/starpu_config.h.in
  35. 1 2
      include/starpu_fxt.h
  36. 17 0
      include/starpu_helper.h
  37. 5 0
      include/starpu_scheduler.h
  38. 23 0
      include/starpu_stdlib.h
  39. 1 1
      include/starpu_task.h
  40. 2 0
      julia/examples/mult/gpu_mult.cu
  41. 2 0
      julia/examples/old_examples/gpu_mult.cu
  42. 2 0
      julia/examples/old_examples/mandelbrot/gpu_mandelbrot.cu
  43. 2 0
      julia/examples/old_examples/mandelbrot/gpu_mandelbrot_between.cu
  44. 2 0
      julia/examples/old_examples/mult/gpu_mult.cu
  45. 4 0
      julia/examples/old_examples/nbody/gpu_nbody.cu
  46. 2 0
      julia/examples/old_examples/nbody/gpu_nbody_between.cu
  47. 4 0
      julia/src/compiler/expressions.jl
  48. 1 1
      mpi/examples/matrix_decomposition/mpi_cholesky.c
  49. 223 57
      mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c
  50. 12 1
      mpi/examples/matrix_decomposition/mpi_decomposition_params.c
  51. 8 0
      mpi/examples/matrix_decomposition/mpi_decomposition_params.h
  52. 6 0
      mpi/include/starpu_mpi.h
  53. 2 3
      mpi/src/starpu_mpi_datatype.c
  54. 5 0
      mpi/src/starpu_mpi_init.c
  55. 2 0
      mpi/tests/ring_kernel.cu
  56. 11 4
      src/Makefile.am
  57. 3 0
      src/core/jobs.c
  58. 5 4
      src/core/perfmodel/perfmodel_bus.c
  59. 1 0
      src/core/perfmodel/perfmodel_print.c
  60. 1 0
      src/core/sched_ctx.c
  61. 20 20
      src/core/sched_policy.c
  62. 24 2
      src/core/simgrid.c
  63. 1 1
      src/core/simgrid.h
  64. 1 0
      src/core/task.c
  65. 27 0
      src/core/topology.c
  66. 6 0
      src/core/workers.c
  67. 6 0
      src/core/workers.h
  68. 58 17
      src/datawizard/coherency.c
  69. 2 2
      src/datawizard/coherency.h
  70. 2 2
      src/datawizard/copy_driver.c
  71. 10 1
      src/datawizard/copy_driver.h
  72. 47 29
      src/datawizard/data_request.c
  73. 7 8
      src/datawizard/data_request.h
  74. 2 1
      src/datawizard/filters.c
  75. 3 0
      src/datawizard/interfaces/bcsr_interface.c
  76. 3 0
      src/datawizard/interfaces/block_interface.c
  77. 3 0
      src/datawizard/interfaces/coo_interface.c
  78. 3 0
      src/datawizard/interfaces/csr_interface.c
  79. 1 1
      src/datawizard/interfaces/data_interface.c
  80. 3 0
      src/datawizard/interfaces/matrix_interface.c
  81. 3 0
      src/datawizard/interfaces/multiformat_interface.c
  82. 3 0
      src/datawizard/interfaces/tensor_interface.c
  83. 3 0
      src/datawizard/interfaces/variable_interface.c
  84. 3 0
      src/datawizard/interfaces/vector_interface.c
  85. 3 0
      src/datawizard/interfaces/void_interface.c
  86. 23 18
      src/datawizard/memalloc.c
  87. 9 1
      src/datawizard/memalloc.h
  88. 1 0
      src/datawizard/memory_manager.c
  89. 1 0
      src/datawizard/reduction.c
  90. 7 6
      src/datawizard/user_interactions.c
  91. 2 1
      src/datawizard/write_back.c
  92. 2 2
      src/debug/latency.c
  93. 10 8
      src/debug/traces/starpu_fxt.c
  94. 0 2
      src/debug/traces/starpu_fxt.h
  95. 19 19
      src/debug/traces/starpu_fxt_mpi.c
  96. 2 0
      src/debug/traces/starpu_paje.c
  97. 4 1
      src/drivers/cpu/driver_cpu.c
  98. 29 20
      src/drivers/cuda/driver_cuda.c
  99. 1 0
      src/drivers/disk/driver_disk.c
  100. 0 0
      src/drivers/driver_common/driver_common.c

+ 11 - 0
ChangeLog

@@ -31,9 +31,20 @@ New features:
     files. This file can be parsed by the new script
     files. This file can be parsed by the new script
     starpu_fxt_number_events_to_names.py to convert event keys to event names.
     starpu_fxt_number_events_to_names.py to convert event keys to event names.
   * New STARPU_PER_WORKER perfmodel.
   * New STARPU_PER_WORKER perfmodel.
+  * Add energy accounting in the simgrid mode: starpu_energy_use() and
+    starpu_energy_used().
+  * New function starpu_mpi_get_thread_cpuid() to know where the MPI
+    thread is bound.
+  * New function starpu_get_pu_os_index() to convert logical index of a PU to
+    its OS index.
+  * New function starpu_get_hwloc_topology() to get the hwloc topology used by
+    StarPU.
+  * Add a task prefetch level, to improve retention of data in accelerators
+    so we can make prefetch more aggressive.
 
 
 Small changes:
 Small changes:
   * Use the S4U interface of Simgrid instead of xbt and MSG.
   * Use the S4U interface of Simgrid instead of xbt and MSG.
+  * Add a synthetic energy efficiency testcase.
 
 
 StarPU 1.3.4 (git revision xxx)
 StarPU 1.3.4 (git revision xxx)
 ==============================================
 ==============================================

+ 15 - 1
configure.ac

@@ -1459,6 +1459,9 @@ if test x$enable_cuda = xyes; then
 	    ]
 	    ]
 	)
 	)
 	if test x$have_valid_nvml = xyes ; then
 	if test x$have_valid_nvml = xyes ; then
+		AC_CHECK_DECLS([nvmlDeviceGetTotalEnergyConsumption], [
+			AC_CHECK_FUNCS([nvmlDeviceGetTotalEnergyConsumption])
+			], [], [[#include <nvml.h>]])
 		AC_DEFINE([HAVE_LIBNVIDIA_ML], [1], [Define to 1 if you have the nvidia-ml library])
 		AC_DEFINE([HAVE_LIBNVIDIA_ML], [1], [Define to 1 if you have the nvidia-ml library])
 		STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lnvidia-ml"
 		STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lnvidia-ml"
 	fi
 	fi
@@ -2321,6 +2324,14 @@ AC_MSG_RESULT($nmaxbuffers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXBUFS, [$nmaxbuffers],
 AC_DEFINE_UNQUOTED(STARPU_NMAXBUFS, [$nmaxbuffers],
 		[how many buffers can be manipulated per task])
 		[how many buffers can be manipulated per task])
 
 
+AC_MSG_CHECKING(how many MPI nodes fxt files can be manipulated when generating traces)
+AC_ARG_ENABLE(fxt-max-files, [AS_HELP_STRING([--enable-fxt-max-files=<nbuffers>],
+			[maximum number of mpi nodes for traces])],
+			nmaxfxtfiles=$enableval, nmaxfxtfiles=64)
+AC_MSG_RESULT($nmaxfxtfiles)
+AC_DEFINE_UNQUOTED(STARPU_FXT_MAX_FILES, [$nmaxfxtfiles],
+		[how many MPI nodes fxt files can be manipulated when generating traces])
+
 AC_MSG_CHECKING(maximum number of memory nodes to use per MPI rank)
 AC_MSG_CHECKING(maximum number of memory nodes to use per MPI rank)
 AC_ARG_ENABLE(maxnodes, [AS_HELP_STRING([--enable-maxnodes=<nnodes>],
 AC_ARG_ENABLE(maxnodes, [AS_HELP_STRING([--enable-maxnodes=<nnodes>],
 			[maximum number of memory nodes per MPI rank])],
 			[maximum number of memory nodes per MPI rank])],
@@ -2645,7 +2656,7 @@ if test "x$enable_build_fortran_requested" = "xyes" ; then
 	fi
 	fi
 	if test "x$enable_build_fortran" = "xyes" ; then
 	if test "x$enable_build_fortran" = "xyes" ; then
 		AC_DEFINE(STARPU_HAVE_FC, [1], [Define this if a Fortran compiler is available])
 		AC_DEFINE(STARPU_HAVE_FC, [1], [Define this if a Fortran compiler is available])
-		if test x$build_mpi_lib = xyes -o x$build_mpi_master_slave = xyes ; then
+		if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes -o x$build_mpi_master_slave = xyes ; then
 			#Check MPIFORT
 			#Check MPIFORT
 			if test x$enable_simgrid = xyes ; then
 			if test x$enable_simgrid = xyes ; then
 				DEFAULT_MPIFORT=smpifort
 				DEFAULT_MPIFORT=smpifort
@@ -3620,6 +3631,9 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh tests/microbenchs/
+  mkdir -p tests/energy
+  test -e tests/energy/static.sh || ln -sf $ac_abs_top_srcdir/tests/energy/static.sh tests/energy/
+  test -e tests/energy/dynamic.sh || ln -sf $ac_abs_top_srcdir/tests/energy/dynamic.sh tests/energy/
   mkdir -p tests/datawizard
   mkdir -p tests/datawizard
   test -e tests/datawizard/locality.sh || ln -sf $ac_abs_top_srcdir/tests/datawizard/locality.sh tests/datawizard/
   test -e tests/datawizard/locality.sh || ln -sf $ac_abs_top_srcdir/tests/datawizard/locality.sh tests/datawizard/
   mkdir -p tests/overlap
   mkdir -p tests/overlap

+ 1 - 1
doc/doxygen/Makefile.am

@@ -307,5 +307,5 @@ EXTRA_DIST += doxygen.cfg refman.tex \
 # Rule to update documentation on web server. Should only be used locally.
 # Rule to update documentation on web server. Should only be used locally.
 PUBLISHHOST	?= gforge
 PUBLISHHOST	?= gforge
 update-web: $(DOX_PDF)
 update-web: $(DOX_PDF)
-	scp -pr starpu.pdf html $(PUBLISHHOST):/home/groups/starpu/htdocs/doc
+	scp -pr starpu.pdf html $(PUBLISHHOST):/home/groups/starpu/htdocs/files/doc
 
 

+ 2 - 0
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -91,6 +91,8 @@ operations to avoid this issue. For instance:
 
 
 \code{.c}
 \code{.c}
 func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
 func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
+cudaError_t status = cudaGetLastError();
+if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 cudaStreamSynchronize(starpu_cuda_get_local_stream());
 cudaStreamSynchronize(starpu_cuda_get_local_stream());
 \endcode
 \endcode
 
 

+ 1 - 1
doc/doxygen/chapters/301_tasks.doxy

@@ -118,7 +118,7 @@ to delay the termination of a task until the termination of other tasks.
 
 
 \section SettingManyDataHandlesForATask Setting Many Data Handles For a Task
 \section SettingManyDataHandlesForATask Setting Many Data Handles For a Task
 
 
-The maximum number of data a task can manage is fixed by the environment variable
+The maximum number of data a task can manage is fixed by the macro
 \ref STARPU_NMAXBUFS which has a default value which can be changed
 \ref STARPU_NMAXBUFS which has a default value which can be changed
 through the \c configure option \ref enable-maxbuffers "--enable-maxbuffers".
 through the \c configure option \ref enable-maxbuffers "--enable-maxbuffers".
 
 

+ 3 - 1
doc/doxygen/chapters/320_scheduling.doxy

@@ -205,7 +205,9 @@ simply tend to run all computations on the most energy-conservative processing
 unit. To account for the consumption of the whole machine (including idle
 unit. To account for the consumption of the whole machine (including idle
 processing units), the idle power of the machine should be given by setting
 processing units), the idle power of the machine should be given by setting
 <c>export STARPU_IDLE_POWER=200</c> (\ref STARPU_IDLE_POWER) for 200W, for instance. This value can often
 <c>export STARPU_IDLE_POWER=200</c> (\ref STARPU_IDLE_POWER) for 200W, for instance. This value can often
-be obtained from the machine power supplier.
+be obtained from the machine power supplier, e.g. by running
+
+<c>ipmitool -I lanplus -H mymachine-ipmi -U myuser -P mypasswd sdr type Current</c>
 
 
 The energy actually consumed by the total execution can be displayed by setting
 The energy actually consumed by the total execution can be displayed by setting
 <c>export STARPU_PROFILING=1 STARPU_WORKER_STATS=1</c> (\ref STARPU_PROFILING and \ref STARPU_WORKER_STATS).
 <c>export STARPU_PROFILING=1 STARPU_WORKER_STATS=1</c> (\ref STARPU_PROFILING and \ref STARPU_WORKER_STATS).

+ 3 - 0
doc/doxygen/chapters/350_scheduling_policy_definition.doxy

@@ -60,6 +60,9 @@ queue the transfers on the idle prefetch queue, which is only processed when
 there are no non-idle prefetch to process.
 there are no non-idle prefetch to process.
 starpu_get_prefetch_flag() is a convenient helper for checking the value of the 
 starpu_get_prefetch_flag() is a convenient helper for checking the value of the 
 \ref STARPU_PREFETCH environment variable.
 \ref STARPU_PREFETCH environment variable.
+When a scheduler does such prefetching, it should set the <c>prefetches</c>
+field of the <c>starpu_sched_policy</c> to 1, to prevent the core from
+triggering its own prefetching.
 
 
 Usual functions can be used on tasks, for instance one can use the following to
 Usual functions can be used on tasks, for instance one can use the following to
 get the data size for a task.
 get the data size for a task.

+ 9 - 0
doc/doxygen/chapters/510_configure_options.doxy

@@ -527,6 +527,15 @@ Define the maximum number of buffers that tasks will be able to take
 as parameters, then available as the macro ::STARPU_NMAXBUFS.
 as parameters, then available as the macro ::STARPU_NMAXBUFS.
 </dd>
 </dd>
 
 
+<dt>--enable-fxt-max-files=<c>count</c></dt>
+<dd>
+\anchor enable-fxt-max-files
+\addindex __configure__--enable-fxt-max-files
+Use at most <c>count</c> fxt files from MPI nodes when generating traces.  This information is then available as
+the macro ::STARPU_FXT_MAX_FILES.  This information is used by FxT tools when considering multi-node traces.
+Default value is 64.
+</dd>
+
 <dt>--enable-allocation-cache</dt>
 <dt>--enable-allocation-cache</dt>
 <dd>
 <dd>
 \anchor enable-allocation-cache
 \anchor enable-allocation-cache

+ 2 - 0
doc/doxygen/chapters/code/vector_scal_cuda.c

@@ -35,6 +35,8 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
 
         vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(n, val, *factor);
         vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(n, val, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 
         cudaStreamSynchronize(starpu_cuda_get_local_stream());
         cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 1 - 1
doc/doxygen_dev/Makefile.am

@@ -248,5 +248,5 @@ EXTRA_DIST += doxygen.cfg refman.tex \
 # Rule to update documentation on web server. Should only be used locally.
 # Rule to update documentation on web server. Should only be used locally.
 PUBLISHHOST	?= gforge
 PUBLISHHOST	?= gforge
 update-web: $(DOX_PDF)
 update-web: $(DOX_PDF)
-	scp -pr starpu_dev.pdf html_dev $(PUBLISHHOST):/home/groups/starpu/htdocs/doc
+	scp -pr starpu_dev.pdf html_dev $(PUBLISHHOST):/home/groups/starpu/htdocs/files/doc
 
 

+ 2 - 0
doc/tutorial/vector_scal_cuda.cu

@@ -35,6 +35,8 @@ extern "C" void vector_scal_cuda(void *buffers[], void *_args)
         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
 
         vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);
         vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 
         cudaStreamSynchronize(starpu_cuda_get_local_stream());
         cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 2 - 0
examples/basic_examples/block_cuda.cu

@@ -40,5 +40,7 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
         float *multiplier = (float *)_args;
         float *multiplier = (float *)_args;
 
 
         cuda_block<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *multiplier);
         cuda_block<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *multiplier);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 2 - 0
examples/basic_examples/multiformat_conversion_codelets_cuda.cu

@@ -44,4 +44,6 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
 
         cpu_to_cuda_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(src, dst, n);
         cpu_to_cuda_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(src, dst, n);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 2 - 0
examples/basic_examples/multiformat_cuda.cu

@@ -39,6 +39,8 @@ extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         multiformat_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(soa, n);
         multiformat_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(soa, n);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 2 - 0
examples/basic_examples/variable_kernels.cu

@@ -27,5 +27,7 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 	float *val = (float *)STARPU_VARIABLE_GET_PTR(descr[0]);
 	float *val = (float *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 
 	cuda_variable<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
 	cuda_variable<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 2 - 0
examples/basic_examples/vector_scal_cuda.cu

@@ -41,4 +41,6 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
 
         vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(n, val, *factor);
         vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(n, val, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 2 - 0
examples/filters/custom_mf/conversion.cu

@@ -45,4 +45,6 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         custom_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(aop, n, x, y);
         custom_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(aop, n, x, y);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 2 - 0
examples/filters/custom_mf/cuda.cu

@@ -39,4 +39,6 @@ extern "C" void custom_scal_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         scal_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(n, x, y);
         scal_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(n, x, y);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 2 - 0
examples/filters/fblock_cuda.cu

@@ -43,4 +43,6 @@ extern "C" void cuda_func(void *buffers[], void *_args)
 
 
         /* TODO: use more blocks and threads in blocks */
         /* TODO: use more blocks and threads in blocks */
         fblock_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *factor);
         fblock_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 4 - 0
examples/filters/fmultiple_cuda.cu

@@ -44,6 +44,8 @@ extern "C" void fmultiple_check_scale_cuda(void *buffers[], void *cl_arg)
 
 
         /* TODO: use more vals and threads in vals */
         /* TODO: use more vals and threads in vals */
 	_fmultiple_check_scale_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val, nx, ny, ld, start, factor);
 	_fmultiple_check_scale_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val, nx, ny, ld, start, factor);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }
 
 
 static __global__ void _fmultiple_check_cuda(int *val, int nx, int ny, unsigned ld, int start, int factor)
 static __global__ void _fmultiple_check_cuda(int *val, int nx, int ny, unsigned ld, int start, int factor)
@@ -71,4 +73,6 @@ extern "C" void fmultiple_check_cuda(void *buffers[], void *cl_arg)
 
 
         /* TODO: use more vals and threads in vals */
         /* TODO: use more vals and threads in vals */
 	_fmultiple_check_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val, nx, ny, ld, start, factor);
 	_fmultiple_check_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val, nx, ny, ld, start, factor);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 2 - 0
examples/incrementer/incrementer_kernels.cu

@@ -32,4 +32,6 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 
 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 2 - 0
examples/interface/complex_kernels.cu

@@ -44,4 +44,6 @@ extern "C" void copy_complex_codelet_cuda(void *descr[], void *_args)
 	unsigned nblocks = (nx + threads_per_block-1) / threads_per_block;
 	unsigned nblocks = (nx + threads_per_block-1) / threads_per_block;
 
 
         complex_copy_cuda<<<nblocks, threads_per_block, 0, starpu_cuda_get_local_stream()>>>(o_real, o_imaginary, i_real, i_imaginary, nx);
         complex_copy_cuda<<<nblocks, threads_per_block, 0, starpu_cuda_get_local_stream()>>>(o_real, o_imaginary, i_real, i_imaginary, nx);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 1 - 0
examples/mult/double.h

@@ -15,6 +15,7 @@
  */
  */
 
 
 #define TYPE	double
 #define TYPE	double
+#define EPSILON	0.000000000001
 
 
 #define CUBLAS_GEMM cublasDgemm
 #define CUBLAS_GEMM cublasDgemm
 #define CPU_GEMM	STARPU_DGEMM
 #define CPU_GEMM	STARPU_DGEMM

+ 1 - 0
examples/mult/simple.h

@@ -15,6 +15,7 @@
  */
  */
 
 
 #define TYPE	float
 #define TYPE	float
+#define EPSILON	0.000001
 
 
 #define CUBLAS_GEMM cublasSgemm
 #define CUBLAS_GEMM cublasSgemm
 #define CPU_GEMM	STARPU_SGEMM
 #define CPU_GEMM	STARPU_SGEMM

+ 1 - 1
examples/mult/xgemm.c

@@ -75,7 +75,7 @@ static int check_output(void)
 	TYPE err;
 	TYPE err;
 	err = CPU_ASUM(xdim*ydim, C, 1);
 	err = CPU_ASUM(xdim*ydim, C, 1);
 
 
-	if (err < xdim*ydim*0.001)
+	if (err < EPSILON*xdim*ydim*zdim)
 	{
 	{
 		FPRINTF(stderr, "Results are OK\n");
 		FPRINTF(stderr, "Results are OK\n");
 		return 0;
 		return 0;

+ 2 - 0
examples/pi/SobolQRNG/sobol_gpu.cu

@@ -165,4 +165,6 @@ extern "C" void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_direct
 
 
     // Execute GPU kernel
     // Execute GPU kernel
     sobolGPU_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(n_vectors, n_dimensions, d_directions, d_output);
     sobolGPU_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(n_vectors, n_dimensions, d_directions, d_output);
+    cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 4 - 0
examples/pi/pi_kernel.cu

@@ -137,12 +137,16 @@ extern "C" void cuda_kernel(void *descr[], void *cl_arg)
 	/* each entry of per_block_cnt contains the number of successful shots
 	/* each entry of per_block_cnt contains the number of successful shots
 	 * in the corresponding block. */
 	 * in the corresponding block. */
 	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
 	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 
 
 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
 
 
 	/* compute the total number of successful shots by adding the elements
 	/* compute the total number of successful shots by adding the elements
 	 * of the per_block_cnt array */
 	 * of the per_block_cnt array */
 	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, cnt);
 	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	if (cures)
 	if (cures)
 		STARPU_CUDA_REPORT_ERROR(cures);
 		STARPU_CUDA_REPORT_ERROR(cures);

+ 4 - 0
examples/pi/pi_redux_kernel.cu

@@ -115,12 +115,16 @@ extern "C" void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned lo
 	/* each entry of per_block_cnt contains the number of successful shots
 	/* each entry of per_block_cnt contains the number of successful shots
 	 * in the corresponding block. */
 	 * in the corresponding block. */
 	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, y, n, per_block_cnt);
 	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, y, n, per_block_cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 
 
 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
 
 
 	/* compute the total number of successful shots by adding the elements
 	/* compute the total number of successful shots by adding the elements
 	 * of the per_block_cnt array */
 	 * of the per_block_cnt array */
 	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, shot_cnt);
 	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, shot_cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	if (cures)
 	if (cures)
 		STARPU_CUDA_REPORT_ERROR(cures);
 		STARPU_CUDA_REPORT_ERROR(cures);

+ 2 - 0
examples/reductions/dot_product_kernels.cu

@@ -33,4 +33,6 @@ extern "C" void redux_cuda_func(void *descr[], void *_args)
 	DOT_TYPE *dotb = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
 	DOT_TYPE *dotb = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
 
 	cuda_redux<<<1,1, 0, starpu_cuda_get_local_stream()>>>(dota, dotb);
 	cuda_redux<<<1,1, 0, starpu_cuda_get_local_stream()>>>(dota, dotb);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 2 - 0
examples/sched_ctx/axpy_partition_gpu.cu

@@ -73,4 +73,6 @@ extern "C" void cuda_axpy(void *descr[], void *_args)
 	__P_HOSTSETUP(saxpy_partitioned,dim3(dimensions,1,1),dimensions,0,SM_mapping_start,SM_allocation,stream);
 	__P_HOSTSETUP(saxpy_partitioned,dim3(dimensions,1,1),dimensions,0,SM_mapping_start,SM_allocation,stream);
 
 
   	saxpy_partitioned<<<width,dimensions,0,stream>>>(__P_HKARGS,n,a,x,y);
   	saxpy_partitioned<<<width,dimensions,0,stream>>>(__P_HKARGS,n,a,x,y);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 2 - 0
examples/spmv/spmv_cuda.cu

@@ -97,6 +97,8 @@ extern "C" void spmv_kernel_cuda(void *descr[], void *args)
 
 
 	spmv_kernel_3<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>
 	spmv_kernel_3<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>
 		(nnz, nrow, nzval, colind, rowptr, firstentry, vecin, nx_in, vecout, nx_out);
 		(nnz, nrow, nzval, colind, rowptr, firstentry, vecin, nx_in, vecout, nx_out);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }
 
 
 
 

+ 2 - 0
examples/stencil/life_cuda.cu

@@ -73,4 +73,6 @@ extern "C" void cuda_life_update_host(int bz, const TYPE *old, TYPE *newp, int n
 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
 #endif
 #endif
 	cuda_life_update <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, old, newp, nx, ny, nz, ldy, ldz, iter);
 	cuda_life_update <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, old, newp, nx, ny, nz, ldy, ldz, iter);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 2 - 0
examples/stencil/shadow.cu

@@ -53,4 +53,6 @@ extern "C" void cuda_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int
 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
 #endif
 #endif
 	cuda_shadow <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, ptr, nx, ny, nz, ldy, ldz, i);
 	cuda_shadow <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, ptr, nx, ny, nz, ldy, ldz, i);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 }

+ 9 - 0
include/starpu_config.h.in

@@ -187,6 +187,15 @@
 #undef STARPU_NMAXBUFS
 #undef STARPU_NMAXBUFS
 
 
 /**
 /**
+   Define the maximum number of fxt mpi files that can be read when
+   generating traces. The default value is 64, it can be changed by
+   using the configure option \ref enable-fxt-max-files
+   "--enable-fxt-max-files".
+   @ingroup API_MPI_Support
+*/
+#undef STARPU_FXT_MAX_FILES
+
+/**
    Define the maximum number of CPU workers managed by StarPU. The
    Define the maximum number of CPU workers managed by StarPU. The
    default value can be modified at configure by using the option \ref
    default value can be modified at configure by using the option \ref
    enable-maxcpus "--enable-maxcpus".
    enable-maxcpus "--enable-maxcpus".

+ 1 - 2
include/starpu_fxt.h

@@ -20,6 +20,7 @@
 #ifndef __STARPU_FXT_H__
 #ifndef __STARPU_FXT_H__
 #define __STARPU_FXT_H__
 #define __STARPU_FXT_H__
 
 
+#include <starpu_config.h>
 #include <starpu_perfmodel.h>
 #include <starpu_perfmodel.h>
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus
@@ -32,8 +33,6 @@ extern "C"
    @{
    @{
 */
 */
 
 
-#define STARPU_FXT_MAX_FILES	64
-
 struct starpu_fxt_codelet_event
 struct starpu_fxt_codelet_event
 {
 {
 	char symbol[256];
 	char symbol[256];

+ 17 - 0
include/starpu_helper.h

@@ -20,6 +20,10 @@
 #include <stdio.h>
 #include <stdio.h>
 #include <starpu.h>
 #include <starpu.h>
 
 
+#ifdef STARPU_HAVE_HWLOC
+#include <hwloc.h>
+#endif
+
 #ifdef __cplusplus
 #ifdef __cplusplus
 extern "C"
 extern "C"
 {
 {
@@ -190,6 +194,19 @@ int starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_ha
 */
 */
 void starpu_display_bindings(void);
 void starpu_display_bindings(void);
 
 
+/**
+   If \c hwloc is used, convert the given \p logical_index of a PU to the OS
+   index of this PU. If \c hwloc is not used, return \p logical_index.
+*/
+int starpu_get_pu_os_index(unsigned logical_index);
+
+#ifdef STARPU_HAVE_HWLOC
+/**
+   Get the hwloc topology used by StarPU. One can use this pointer to get
+   information about topology, but not to change settings related to topology.
+*/
+hwloc_topology_t starpu_get_hwloc_topology(void);
+#endif
 /** @} */
 /** @} */
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus

+ 5 - 0
include/starpu_scheduler.h

@@ -186,6 +186,11 @@ struct starpu_sched_policy
 	*/
 	*/
 	void (*remove_workers)(unsigned sched_ctx_id, int *workerids, unsigned nworkers);
 	void (*remove_workers)(unsigned sched_ctx_id, int *workerids, unsigned nworkers);
 
 
+	/** Whether this scheduling policy does data prefetching, and thus the
+	    core should not try to do it opportunistically.
+	*/
+	int prefetches;
+
 	/**
 	/**
 	   Optional field. Name of the policy.
 	   Optional field. Name of the policy.
 	*/
 	*/

+ 23 - 0
include/starpu_stdlib.h

@@ -239,9 +239,32 @@ void starpu_memory_deallocate(unsigned node, size_t size);
 */
 */
 void starpu_memory_wait_available(unsigned node, size_t size);
 void starpu_memory_wait_available(unsigned node, size_t size);
 
 
+/**
+   Sleep for the given \p nb_sec seconds.
+   In simgrid mode, this only sleeps within virtual time.
+  */
 void starpu_sleep(float nb_sec);
 void starpu_sleep(float nb_sec);
+
+/**
+   Sleep for the given \p nb_micro_sec micro-seconds.
+   In simgrid mode, this only sleeps within virtual time.
+  */
 void starpu_usleep(float nb_micro_sec);
 void starpu_usleep(float nb_micro_sec);
 
 
+/**
+   Account for \p joules J being used.
+   This is support in simgrid mode, to record how much energy was used, and will
+   show up in further call to starpu_energy_used().
+  */
+void starpu_energy_use(float joules);
+
+/**
+   Return the amount of energy having been used in J.
+   This account the amounts passed to starpu_energy_use(), but also the static
+   energy use set by the \ref STARPU_IDLE_POWER environment variable.
+  */
+double starpu_energy_used(void);
+
 /** @} */
 /** @} */
 
 
 #ifdef __cplusplus
 #ifdef __cplusplus

+ 1 - 1
include/starpu_task.h

@@ -563,7 +563,7 @@ struct starpu_codelet
 
 
 	/**
 	/**
 	   Optional pointer to the task energy consumption performance
 	   Optional pointer to the task energy consumption performance
-	   model associated to this codelet. This optional field is
+	   model associated to this codelet (in J). This optional field is
 	   ignored when set to <c>NULL</c> or when its field
 	   ignored when set to <c>NULL</c> or when its field
 	   starpu_perfmodel::symbol is not set. In the case of
 	   starpu_perfmodel::symbol is not set. In the case of
 	   parallel codelets, this has to account for all processing
 	   parallel codelets, this has to account for all processing

+ 2 - 0
julia/examples/mult/gpu_mult.cu

@@ -79,6 +79,8 @@ extern "C" void gpu_mult(void * descr[], void * args)
 	gpuMultKernel
 	gpuMultKernel
 		<<< nblocks, THREADS_PER_BLOCK, 0, NULL /*starpu_cuda_get_local_stream()*/
 		<<< nblocks, THREADS_PER_BLOCK, 0, NULL /*starpu_cuda_get_local_stream()*/
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 

+ 2 - 0
julia/examples/old_examples/gpu_mult.cu

@@ -78,6 +78,8 @@ extern "C" void gpu_mult(void * descr[], void * args)
 	gpuMultKernel
 	gpuMultKernel
 		<<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
 		<<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 

+ 2 - 0
julia/examples/old_examples/mandelbrot/gpu_mandelbrot.cu

@@ -106,6 +106,8 @@ extern "C" void gpu_mandelbrot(void *descr[], void *args)
   nblocks = (nxP * nyP + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
   nblocks = (nxP * nyP + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
 
 
   gpuMandelbrotKernel <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream() >>> (nxP, nyP, ldP, d_subP, *params);
   gpuMandelbrotKernel <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream() >>> (nxP, nyP, ldP, d_subP, *params);
+  cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 2 - 0
julia/examples/old_examples/mandelbrot/gpu_mandelbrot_between.cu

@@ -123,6 +123,8 @@ extern "C" void CUDA_mandelbrot(void** buffers_uwrYFDVe, void* cl_arg_uwrYFDVe)
              ptr_qoUGBRtY, local_height, conv_limit, ptr_A5zD9sJZ, 
              ptr_qoUGBRtY, local_height, conv_limit, ptr_A5zD9sJZ, 
              ld_A5zD9sJZ);
              ld_A5zD9sJZ);
     ;
     ;
+    cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
     cudaStreamSynchronize(starpu_cuda_get_local_stream());
     cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }
 
 

+ 2 - 0
julia/examples/old_examples/mult/gpu_mult.cu

@@ -78,6 +78,8 @@ extern "C" void gpu_mult(void * descr[], void * args)
 	gpuMultKernel
 	gpuMultKernel
 		<<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
 		<<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 

+ 4 - 0
julia/examples/old_examples/nbody/gpu_nbody.cu

@@ -94,6 +94,8 @@ extern "C" void gpu_nbody(void * descr[], void * args)
   gpuNbodyKernel
   gpuNbodyKernel
     <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
     <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
     >>> (d_P,  d_subA, d_M, nxP, nxA, nxM, ldP, ldA, *params);
     >>> (d_P,  d_subA, d_M, nxP, nxA, nxM, ldP, ldA, *params);
+  cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
 
@@ -156,6 +158,8 @@ extern "C" void gpu_nbody2(void * descr[], void *args)
   gpuNbody2Kernel
   gpuNbody2Kernel
     <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
     <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
     >>> (d_subP, d_subV, d_subA, nxP, nxV, nxA, ldP, ldV, ldA, *params);
     >>> (d_subP, d_subV, d_subA, nxP, nxV, nxA, ldP, ldV, ldA, *params);
+  cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 2 - 0
julia/examples/old_examples/nbody/gpu_nbody_between.cu

@@ -161,6 +161,8 @@ extern "C" void CUDA_nbody_updt(void** buffers_gj6UYWT4, void* cl_arg_gj6UYWT4)
              ld_jJ5f8wMA, ptr_piPvdbTs, ld_piPvdbTs, ptr_JBaPgPiT, 
              ld_jJ5f8wMA, ptr_piPvdbTs, ld_piPvdbTs, ptr_JBaPgPiT, 
              ptr_0STm2S4k, ld_0STm2S4k);
              ptr_0STm2S4k, ld_0STm2S4k);
     ;
     ;
+    cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
     cudaStreamSynchronize(starpu_cuda_get_local_stream());
     cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }
 
 

+ 4 - 0
julia/src/compiler/expressions.jl

@@ -335,6 +335,10 @@ function print(io :: IO, expr :: StarpuExprCudaCall ; indent = 0,restrict=false)
 
 
     print(io, ");")
     print(io, ");")
     print_newline(io, indent)
     print_newline(io, indent)
+    print(io, "cudaError_t status = cudaGetLastError();")
+    print_newline(io, indent)
+    print(io, "if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);")
+    print_newline(io, indent)
 
 
 end
 end
 
 

+ 1 - 1
mpi/examples/matrix_decomposition/mpi_cholesky.c

@@ -58,7 +58,7 @@ int main(int argc, char **argv)
 #ifndef STARPU_SIMGRID
 #ifndef STARPU_SIMGRID
 	matrix_display(bmat, rank);
 	matrix_display(bmat, rank);
 
 
-	if (check)
+	if (check && rank == 0)
 		dw_cholesky_check_computation(bmat, rank, nodes, &correctness, &flops, 0.001);
 		dw_cholesky_check_computation(bmat, rank, nodes, &correctness, &flops, 0.001);
 #endif
 #endif
 
 

+ 223 - 57
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -68,6 +68,212 @@ static struct starpu_codelet cl22 =
 	.color = 0x00ff00,
 	.color = 0x00ff00,
 };
 };
 
 
+static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int nodes)
+{
+	unsigned k, m, n;
+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+
+	for (k = 0; k < nblocks; k++)
+	{
+		starpu_iteration_push(k);
+
+		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
+				       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
+				       STARPU_RW, data_handles[k][k],
+				       0);
+
+		for (m = k+1; m<nblocks; m++)
+		{
+			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+					       STARPU_R, data_handles[k][k],
+					       STARPU_RW, data_handles[m][k],
+					       0);
+
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
+			if (my_distrib(k, k, nodes) == rank)
+				starpu_data_wont_use(data_handles[k][k]);
+
+			for (n = k+1; n<nblocks; n++)
+			{
+				if (n <= m)
+				{
+					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+							       STARPU_R, data_handles[n][k],
+							       STARPU_R, data_handles[m][k],
+							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+							       0);
+				}
+			}
+
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
+			if (my_distrib(m, k, nodes) == rank)
+				starpu_data_wont_use(data_handles[m][k]);
+		}
+		starpu_iteration_pop();
+	}
+}
+
+/* TODO: generated from compiler polyhedral analysis of classical algorithm */
+static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, int nodes)
+{
+	unsigned k, m, n;
+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+
+	/* Column */
+	for (n = 0; n<nblocks; n++)
+	{
+		starpu_iteration_push(n);
+
+		/* Row */
+		for (m = n; m<nblocks; m++)
+		{
+			for (k = 0; k < n; k++)
+			{
+				/* Accumulate updates from TRSMs */
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[n][k],
+						       STARPU_R, data_handles[m][k],
+						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       0);
+			}
+			k = n;
+			if (m > n)
+			{
+				/* non-diagonal block, solve */
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[k][k],
+						       STARPU_RW, data_handles[m][k],
+						       0);
+			}
+			else
+			{
+				/* diagonal block, factorize */
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
+						       STARPU_RW, data_handles[k][k],
+						       0);
+			}
+		}
+
+		starpu_iteration_pop();
+	}
+
+	/* Submit flushes, StarPU will fit them according to the progress */
+	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
+	for (m = 0; m < nblocks; m++)
+		for (n = 0; n < nblocks ; n++)
+			starpu_data_wont_use(data_handles[m][n]);
+}
+
+/* TODO: generated from compiler polyhedral analysis of classical algorithm */
+static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int rank, int nodes)
+{
+	unsigned a, b, c;
+	unsigned k, m, n;
+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+
+	/* double-antidiagonal number:
+	 * - a=0 contains (0,0) plus (1,0)
+	 * - a=1 contains (2,0), (1,1) plus (3,0), (2, 1)
+	 * - etc.
+	 */
+	for (a = 0; a < nblocks; a++)
+	{
+		starpu_iteration_push(a);
+
+		unsigned bfirst;
+		if (2*a < nblocks)
+			bfirst = 0;
+		else
+			bfirst = 2*a - (nblocks-1);
+
+		/* column within first antidiagonal for a */
+		for (b = bfirst; b <= a; b++)
+		{
+			/* column */
+			n = b;
+			/* row */
+			m = 2*a-b;
+
+			/* Accumulate updates from TRSMs */
+			for (c = 0; c < n; c++)
+			{
+				k = c;
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[n][k],
+						       STARPU_R, data_handles[m][k],
+						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       0);
+			}
+
+			if (b < a)
+			{
+				/* non-diagonal block, solve */
+				k = n;
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[k][k],
+						       STARPU_RW, data_handles[m][k],
+						       0);
+			}
+			else
+			{
+				/* diagonal block, factorize */
+				k = a;
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
+						       STARPU_RW, data_handles[k][k],
+						       0);
+			}
+		}
+
+		/* column within second antidiagonal for a */
+		for (b = bfirst; b <= a; b++)
+		{
+			/* column */
+			n = b;
+			/* row */
+			m = 2*a-b + 1;
+
+			if (m >= nblocks)
+				/* Skip first item when even number of tiles */
+				continue;
+
+			/* Accumulate updates from TRSMs */
+			for (c = 0; c < n; c++)
+			{
+				k = c;
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[n][k],
+						       STARPU_R, data_handles[m][k],
+						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       0);
+			}
+			/* non-diagonal block, solve */
+			k = n;
+			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+					       STARPU_R, data_handles[k][k],
+					       STARPU_RW, data_handles[m][k],
+					       0);
+		}
+
+		starpu_iteration_pop();
+	}
+
+	/* Submit flushes, StarPU will fit them according to the progress */
+	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
+	for (m = 0; m < nblocks; m++)
+		for (n = 0; n < nblocks ; n++)
+			starpu_data_wont_use(data_handles[m][n]);
+}
+
 /*
 /*
  *	code to bootstrap the factorization
  *	code to bootstrap the factorization
  *	and construct the DAG
  *	and construct the DAG
@@ -79,8 +285,6 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	starpu_data_handle_t **data_handles;
 	starpu_data_handle_t **data_handles;
 	unsigned k, m, n;
 	unsigned k, m, n;
 
 
-	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
-
 	/* create all the DAG nodes */
 	/* create all the DAG nodes */
 
 
 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
@@ -91,7 +295,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 		for(n = 0; n < nblocks ; n++)
 		for(n = 0; n < nblocks ; n++)
 		{
 		{
 			int mpi_rank = my_distrib(m, n, nodes);
 			int mpi_rank = my_distrib(m, n, nodes);
-			if (mpi_rank == rank)
+			if (mpi_rank == rank || (check && rank == 0))
 			{
 			{
 				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, n, m);
 				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, n, m);
 				starpu_matrix_data_register(&data_handles[m][n], STARPU_MAIN_RAM, (uintptr_t)matA[m][n],
 				starpu_matrix_data_register(&data_handles[m][n], STARPU_MAIN_RAM, (uintptr_t)matA[m][n],
@@ -119,50 +323,16 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 	start = starpu_timing_now();
 	start = starpu_timing_now();
 
 
-	for (k = 0; k < nblocks; k++)
+	switch (submission)
 	{
 	{
-		starpu_iteration_push(k);
-
-		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
-				       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
-				       STARPU_RW, data_handles[k][k],
-				       0);
-
-		for (m = k+1; m<nblocks; m++)
-		{
-			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
-					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
-					       STARPU_R, data_handles[k][k],
-					       STARPU_RW, data_handles[m][k],
-					       0);
-
-			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
-			if (my_distrib(k, k, nodes) == rank)
-				starpu_data_wont_use(data_handles[k][k]);
-
-			for (n = k+1; n<nblocks; n++)
-			{
-				if (n <= m)
-				{
-					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
-							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
-							       STARPU_R, data_handles[n][k],
-							       STARPU_R, data_handles[m][k],
-							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
-							       0);
-				}
-			}
-
-			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
-			if (my_distrib(m, k, nodes) == rank)
-				starpu_data_wont_use(data_handles[m][k]);
-		}
-		starpu_iteration_pop();
+		case TRIANGLES:		run_cholesky(data_handles, rank, nodes); break;
+		case COLUMNS:		run_cholesky_column(data_handles, rank, nodes); break;
+		case ANTIDIAGONALS:	run_cholesky_antidiagonal(data_handles, rank, nodes); break;
+		default: STARPU_ABORT();
 	}
 	}
 
 
 	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
 	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 	starpu_mpi_barrier(MPI_COMM_WORLD);
-
 	end = starpu_timing_now();
 	end = starpu_timing_now();
 
 
 	for (m = 0; m < nblocks; m++)
 	for (m = 0; m < nblocks; m++)
@@ -170,7 +340,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 		for(n = 0; n < nblocks ; n++)
 		for(n = 0; n < nblocks ; n++)
 		{
 		{
 			/* Get back data on node 0 for the check */
 			/* Get back data on node 0 for the check */
-			if (check)
+			if (check && data_handles[m][n])
 				starpu_mpi_get_data_on_node(MPI_COMM_WORLD, data_handles[m][n], 0);
 				starpu_mpi_get_data_on_node(MPI_COMM_WORLD, data_handles[m][n], 0);
 
 
 			if (data_handles[m][n])
 			if (data_handles[m][n])
@@ -248,24 +418,20 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 	{
 	{
 		for (m = 0; m < nblocks; m++)
 		for (m = 0; m < nblocks; m++)
 		{
 		{
-			int mpi_rank = my_distrib(m, n, nodes);
-			if (mpi_rank == rank)
+			for (nn = BLOCKSIZE*n ; nn < BLOCKSIZE*(n+1); nn++)
 			{
 			{
-				for (nn = (size/nblocks)*n ; nn < (size/nblocks)*n+(size/nblocks); nn++)
+				for (mm = BLOCKSIZE*m ; mm < BLOCKSIZE*(m+1); mm++)
 				{
 				{
-					for (mm = (size/nblocks)*m ; mm < (size/nblocks)*m+(size/nblocks); mm++)
+					if (nn <= mm)
 					{
 					{
-						if (nn <= mm)
+						float orig = (1.0f/(1.0f+nn+mm)) + ((nn == mm)?1.0f*size:0.0f);
+						float err = fabsf(test_mat[mm +nn*size] - orig) / orig;
+						if (err > epsilon)
 						{
 						{
-							float orig = (1.0f/(1.0f+nn+mm)) + ((nn == mm)?1.0f*size:0.0f);
-							float err = fabsf(test_mat[mm +nn*size] - orig) / orig;
-							if (err > epsilon)
-							{
-								FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.20f != %2.20f (err %2.20f)\n", rank, nn, mm, test_mat[mm +nn*size], orig, err);
-								*correctness = 0;
-								*flops = 0;
-								break;
-							}
+							FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.20f != %2.20f (err %2.20f)\n", rank, nn, mm, test_mat[mm +nn*size], orig, err);
+							*correctness = 0;
+							*flops = 0;
+							break;
 						}
 						}
 					}
 					}
 				}
 				}

+ 12 - 1
mpi/examples/matrix_decomposition/mpi_decomposition_params.c

@@ -43,6 +43,7 @@ unsigned check = 0;
 unsigned display = 0;
 unsigned display = 0;
 int dblockx = -1;
 int dblockx = -1;
 int dblocky = -1;
 int dblocky = -1;
+enum submission submission = TRIANGLES;
 
 
 void parse_args(int argc, char **argv, int nodes)
 void parse_args(int argc, char **argv, int nodes)
 {
 {
@@ -79,6 +80,16 @@ void parse_args(int argc, char **argv, int nodes)
                         nbigblocks = strtol(argv[++i], &argptr, 10);
                         nbigblocks = strtol(argv[++i], &argptr, 10);
                 }
                 }
 
 
+                if (strcmp(argv[i], "-columns") == 0)
+                {
+                        submission = COLUMNS;
+                }
+
+                if (strcmp(argv[i], "-antidiagonals") == 0)
+                {
+                        submission = ANTIDIAGONALS;
+                }
+
                 if (strcmp(argv[i], "-no-prio") == 0)
                 if (strcmp(argv[i], "-no-prio") == 0)
                 {
                 {
                         noprio = 1;
                         noprio = 1;
@@ -96,7 +107,7 @@ void parse_args(int argc, char **argv, int nodes)
 
 
                 if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
                 if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
                 {
                 {
-			printf("usage : %s [-size size] [-nblocks nblocks] [-no-prio] [-display] [-check]\n", argv[0]);
+                        printf("usage : %s [-size size] [-nblocks nblocks] [-columns] [-antidiagonals] [-no-prio] [-display] [-check]\n", argv[0]);
                 }
                 }
         }
         }
 
 

+ 8 - 0
mpi/examples/matrix_decomposition/mpi_decomposition_params.h

@@ -28,6 +28,14 @@ extern unsigned display;
 extern int dblockx;
 extern int dblockx;
 extern int dblocky;
 extern int dblocky;
 
 
+enum submission
+{
+	TRIANGLES,
+	COLUMNS,
+	ANTIDIAGONALS,
+};
+extern enum submission submission;
+
 void parse_args(int argc, char **argv, int nodes);
 void parse_args(int argc, char **argv, int nodes);
 
 
 #endif // __MPI_CHOLESKY_PARAMS_H__
 #endif // __MPI_CHOLESKY_PARAMS_H__

+ 6 - 0
mpi/include/starpu_mpi.h

@@ -132,6 +132,12 @@ int starpu_mpi_world_size(void);
 */
 */
 int starpu_mpi_comm_get_attr(MPI_Comm comm, int keyval, void *attribute_val, int *flag);
 int starpu_mpi_comm_get_attr(MPI_Comm comm, int keyval, void *attribute_val, int *flag);
 
 
+
+/**
+   Get the logical index of the core where the MPI thread is bound.
+*/
+int starpu_mpi_get_thread_cpuid(void);
+
 int starpu_mpi_get_communication_tag(void);
 int starpu_mpi_get_communication_tag(void);
 void starpu_mpi_set_communication_tag(int tag);
 void starpu_mpi_set_communication_tag(int tag);
 
 

+ 2 - 3
mpi/src/starpu_mpi_datatype.c

@@ -26,17 +26,16 @@ struct _starpu_mpi_datatype_funcs
 	UT_hash_handle hh;
 	UT_hash_handle hh;
 };
 };
 
 
-static starpu_pthread_mutex_t _starpu_mpi_datatype_funcs_table_mutex;
+/* We want to allow applications calling starpu_mpi_interface_datatype_register/unregister as constructor/destructor */
+static starpu_pthread_mutex_t _starpu_mpi_datatype_funcs_table_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 static struct _starpu_mpi_datatype_funcs *_starpu_mpi_datatype_funcs_table = NULL;
 static struct _starpu_mpi_datatype_funcs *_starpu_mpi_datatype_funcs_table = NULL;
 
 
 void _starpu_mpi_datatype_init(void)
 void _starpu_mpi_datatype_init(void)
 {
 {
-	STARPU_PTHREAD_MUTEX_INIT(&_starpu_mpi_datatype_funcs_table_mutex, NULL);
 }
 }
 
 
 void _starpu_mpi_datatype_shutdown(void)
 void _starpu_mpi_datatype_shutdown(void)
 {
 {
-	STARPU_PTHREAD_MUTEX_DESTROY(&_starpu_mpi_datatype_funcs_table_mutex);
 }
 }
 
 
 /*
 /*

+ 5 - 0
mpi/src/starpu_mpi_init.c

@@ -336,3 +336,8 @@ int starpu_mpi_world_rank(void)
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	return rank;
 	return rank;
 }
 }
+
+int starpu_mpi_get_thread_cpuid(void)
+{
+	return _starpu_mpi_thread_cpuid;
+}

+ 2 - 0
mpi/tests/ring_kernel.cu

@@ -27,5 +27,7 @@ extern "C" void increment_cuda(void *descr[], void *_args)
 	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
 	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 
 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(tokenptr);
 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(tokenptr);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 }

+ 11 - 4
src/Makefile.am

@@ -411,9 +411,16 @@ endif
 # static inline definition
 # static inline definition
 dist-hook:
 dist-hook:
 	failed=0 ; \
 	failed=0 ; \
-	for i in $$( $(GREP) "static inline" $$(find $(srcdir) -name \*.h) | $(SED) -e 's/.*static inline //g' | $(GREP) -v ENAME | $(SED) -e 's/[^(]* \(\|\*\)\([^ (]*\)(.*/\2/' | $(GREP) -v _starpu_spin_init) ; do \
-		for j in $(shell find . -name \*.o) ; do \
-			nm $$j | $(GREP) "U $$i$$" && { echo $$j ; failed=1 ; } ; \
-		done ; \
+	look=""; \
+	for i in $$( $(GREP) "static inline" $$(find $(srcdir) -name \*.h) | $(SED) -e 's/.*static inline //g' | $(GREP) -v ENAME\#\# | $(SED) -n -e 's/[^(]* \(\|\*\)\([^ (]*\)(.*/\2/' -e 'p;s/^_*//;p' | $(GREP) -v _starpu_spin_init | $(GREP) -v starpu_sched_ctx_worker_is_master_for_child_ctx) ; do \
+		if [ -z "$$look" ] ; then \
+			look="$$i" ; \
+		else \
+			look="$$look\|$$i" ; \
+		fi ; \
+	done ; \
+	echo "$$look" ; \
+	for j in $(shell find . -name \*.o) ; do \
+		nm $$j | $(GREP) -e "U \($$look\)$$" && { echo $$j ; failed=1 ; } ; \
 	done ; \
 	done ; \
 	[ $$failed == 0 ]
 	[ $$failed == 0 ]

+ 3 - 0
src/core/jobs.c

@@ -24,10 +24,12 @@
 #include <common/config.h>
 #include <common/config.h>
 #include <common/utils.h>
 #include <common/utils.h>
 #include <common/graph.h>
 #include <common/graph.h>
+#include <datawizard/memory_nodes.h>
 #include <profiling/profiling.h>
 #include <profiling/profiling.h>
 #include <profiling/bound.h>
 #include <profiling/bound.h>
 #include <core/debug.h>
 #include <core/debug.h>
 #include <limits.h>
 #include <limits.h>
+#include <core/workers.h>
 
 
 static int max_memory_use;
 static int max_memory_use;
 static unsigned long njobs, maxnjobs;
 static unsigned long njobs, maxnjobs;
@@ -483,6 +485,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 		 * also the callback were executed. */
 		 * also the callback were executed. */
 		j->terminated = 2;
 		j->terminated = 2;
 	}
 	}
+	task->prefetched = 0;
 	STARPU_PTHREAD_COND_BROADCAST(&j->sync_cond);
 	STARPU_PTHREAD_COND_BROADCAST(&j->sync_cond);
 	STARPU_AYU_REMOVETASK(j->job_id);
 	STARPU_AYU_REMOVETASK(j->job_id);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);

+ 5 - 4
src/core/perfmodel/perfmodel_bus.c

@@ -37,6 +37,7 @@
 #include <core/topology.h>
 #include <core/topology.h>
 #include <common/utils.h>
 #include <common/utils.h>
 #include <drivers/mpi/driver_mpi_common.h>
 #include <drivers/mpi/driver_mpi_common.h>
+#include <datawizard/memory_nodes.h>
 
 
 #ifdef STARPU_USE_OPENCL
 #ifdef STARPU_USE_OPENCL
 #include <starpu_opencl.h>
 #include <starpu_opencl.h>
@@ -188,7 +189,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 	/* Allocate a buffer on the device */
 	/* Allocate a buffer on the device */
 	unsigned char *d_buffer;
 	unsigned char *d_buffer;
 	cures = cudaMalloc((void **)&d_buffer, size);
 	cures = cudaMalloc((void **)&d_buffer, size);
-	STARPU_ASSERT(cures == cudaSuccess);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
 
 
 	/* hack to avoid third party libs to rebind threads */
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
 	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
@@ -217,7 +218,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 		cudaHostRegister((void *)h_buffer, size, 0);
 		cudaHostRegister((void *)h_buffer, size, 0);
 	}
 	}
 
 
-	STARPU_ASSERT(cures == cudaSuccess);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
 
 
 	/* hack to avoid third party libs to rebind threads */
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
 	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
@@ -342,7 +343,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	/* Allocate a buffer on the device */
 	/* Allocate a buffer on the device */
 	unsigned char *s_buffer;
 	unsigned char *s_buffer;
 	cures = cudaMalloc((void **)&s_buffer, size);
 	cures = cudaMalloc((void **)&s_buffer, size);
-	STARPU_ASSERT(cures == cudaSuccess);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
 	cudaMemset(s_buffer, 0, size);
 	cudaMemset(s_buffer, 0, size);
 	cudaDeviceSynchronize();
 	cudaDeviceSynchronize();
 
 
@@ -368,7 +369,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	/* Allocate a buffer on the device */
 	/* Allocate a buffer on the device */
 	unsigned char *d_buffer;
 	unsigned char *d_buffer;
 	cures = cudaMalloc((void **)&d_buffer, size);
 	cures = cudaMalloc((void **)&d_buffer, size);
-	STARPU_ASSERT(cures == cudaSuccess);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
 	cudaMemset(d_buffer, 0, size);
 	cudaMemset(d_buffer, 0, size);
 	cudaDeviceSynchronize();
 	cudaDeviceSynchronize();
 
 

+ 1 - 0
src/core/perfmodel/perfmodel_print.c

@@ -19,6 +19,7 @@
 #include <starpu.h>
 #include <starpu.h>
 #include <starpu_perfmodel.h>
 #include <starpu_perfmodel.h>
 #include <common/config.h>
 #include <common/config.h>
+#include <core/workers.h>
 #include "perfmodel.h"
 #include "perfmodel.h"
 
 
 static
 static

+ 1 - 0
src/core/sched_ctx.c

@@ -21,6 +21,7 @@
 #include <common/utils.h>
 #include <common/utils.h>
 #include <stdarg.h>
 #include <stdarg.h>
 #include <core/task.h>
 #include <core/task.h>
+#include <core/workers.h>
 
 
 enum _starpu_ctx_change_op
 enum _starpu_ctx_change_op
 {
 {

+ 20 - 20
src/core/sched_policy.c

@@ -22,6 +22,7 @@
 #include <common/utils.h>
 #include <common/utils.h>
 #include <core/sched_policy.h>
 #include <core/sched_policy.h>
 #include <profiling/profiling.h>
 #include <profiling/profiling.h>
+#include <datawizard/memory_nodes.h>
 #include <common/barrier.h>
 #include <common/barrier.h>
 #include <core/debug.h>
 #include <core/debug.h>
 #include <core/task.h>
 #include <core/task.h>
@@ -569,32 +570,12 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 	int ret = 0;
 	int ret = 0;
 	if (STARPU_UNLIKELY(task->execute_on_a_specific_worker))
 	if (STARPU_UNLIKELY(task->execute_on_a_specific_worker))
 	{
 	{
-		if (starpu_get_prefetch_flag())
-			starpu_prefetch_task_input_for(task, task->workerid);
-
 		ret = _starpu_push_task_on_specific_worker(task, task->workerid);
 		ret = _starpu_push_task_on_specific_worker(task, task->workerid);
 	}
 	}
 	else
 	else
 	{
 	{
 		struct _starpu_machine_config *config = _starpu_get_machine_config();
 		struct _starpu_machine_config *config = _starpu_get_machine_config();
 
 
-		/* When a task can only be executed on a given arch and we have
-		 * only one memory node for that arch, we can systematically
-		 * prefetch before the scheduling decision. */
-		if (starpu_get_prefetch_flag() && starpu_memory_nodes_get_count() > 1)
-		{
-			if (task->where == STARPU_CPU && config->cpus_nodeid >= 0)
-				starpu_prefetch_task_input_on_node(task, config->cpus_nodeid);
-			else if (task->where == STARPU_CUDA && config->cuda_nodeid >= 0)
-				starpu_prefetch_task_input_on_node(task, config->cuda_nodeid);
-                        else if (task->cl->where == STARPU_FPGA && config->fpga_nodeid >= 0)
-				starpu_prefetch_task_input_on_node(task, config->fpga_nodeid);
-			else if (task->where == STARPU_OPENCL && config->opencl_nodeid >= 0)
-				starpu_prefetch_task_input_on_node(task, config->opencl_nodeid);
-			else if (task->where == STARPU_MIC && config->mic_nodeid >= 0)
-				starpu_prefetch_task_input_on_node(task, config->mic_nodeid);
-		}
-
 		if(!sched_ctx->sched_policy)
 		if(!sched_ctx->sched_policy)
 		{
 		{
 			/* Note: we have to call that early, or else the task may have
 			/* Note: we have to call that early, or else the task may have
@@ -637,6 +618,25 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 		}
 		}
 		else
 		else
 		{
 		{
+			/* When a task can only be executed on a given arch and we have
+			 * only one memory node for that arch, we can systematically
+			 * prefetch before the scheduling decision. */
+			if (!sched_ctx->sched_policy->prefetches
+				&& starpu_get_prefetch_flag()
+				&& starpu_memory_nodes_get_count() > 1)
+			{
+				if (task->where == STARPU_CPU && config->cpus_nodeid >= 0)
+					starpu_prefetch_task_input_on_node(task, config->cpus_nodeid);
+				else if (task->where == STARPU_CUDA && config->cuda_nodeid >= 0)
+					starpu_prefetch_task_input_on_node(task, config->cuda_nodeid);
+				else if (task->cl->where == STARPU_FPGA && config->fpga_nodeid >= 0)
+					starpu_prefetch_task_input_on_node(task, config->fpga_nodeid);
+				else if (task->where == STARPU_OPENCL && config->opencl_nodeid >= 0)
+					starpu_prefetch_task_input_on_node(task, config->opencl_nodeid);
+				else if (task->where == STARPU_MIC && config->mic_nodeid >= 0)
+					starpu_prefetch_task_input_on_node(task, config->mic_nodeid);
+			}
+
 			STARPU_ASSERT(sched_ctx->sched_policy->push_task);
 			STARPU_ASSERT(sched_ctx->sched_policy->push_task);
 			/* check out if there are any workers in the context */
 			/* check out if there are any workers in the context */
 			unsigned nworkers = starpu_sched_ctx_get_nworkers(sched_ctx->id);
 			unsigned nworkers = starpu_sched_ctx_get_nworkers(sched_ctx->id);

+ 24 - 2
src/core/simgrid.c

@@ -58,6 +58,8 @@ extern int _starpu_mpi_simgrid_init(int argc, char *argv[]);
 extern void smpi_process_set_user_data(void *);
 extern void smpi_process_set_user_data(void *);
 #endif
 #endif
 
 
+static double _starpu_simgrid_dynamic_energy = 0.0;
+
 /* 1 when MSG_init was done, 2 when initialized through redirected main, 3 when
 /* 1 when MSG_init was done, 2 when initialized through redirected main, 3 when
  * initialized through MSG_process_attach */
  * initialized through MSG_process_attach */
 static int simgrid_started;
 static int simgrid_started;
@@ -629,6 +631,7 @@ struct task
 #else
 #else
 	msg_task_t task;
 	msg_task_t task;
 #endif
 #endif
+	double energy;
 
 
 	/* communication termination signalization */
 	/* communication termination signalization */
 	unsigned *finished;
 	unsigned *finished;
@@ -666,6 +669,7 @@ static void *task_execute(void *arg)
 		MSG_task_execute(task->task);
 		MSG_task_execute(task->task);
 		MSG_task_destroy(task->task);
 		MSG_task_destroy(task->task);
 #endif
 #endif
+		starpu_energy_use(task->energy);
 		_STARPU_DEBUG("task %p finished\n", task);
 		_STARPU_DEBUG("task %p finished\n", task);
 
 
 		*task->finished = 1;
 		*task->finished = 1;
@@ -702,7 +706,7 @@ void _starpu_simgrid_wait_tasks(int workerid)
 }
 }
 
 
 /* Task execution submitted by StarPU */
 /* Task execution submitted by StarPU */
-void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length, unsigned *finished)
+void _starpu_simgrid_submit_job(int workerid, int sched_ctx_id, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length, double energy, unsigned *finished)
 {
 {
 	struct starpu_task *starpu_task = j->task;
 	struct starpu_task *starpu_task = j->task;
 	double flops;
 	double flops;
@@ -717,13 +721,19 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 
 
 	if (isnan(length))
 	if (isnan(length))
 	{
 	{
-		length = starpu_task_expected_length(starpu_task, perf_arch, j->nimpl);
+		length = starpu_task_worker_expected_length(starpu_task, workerid, sched_ctx_id, j->nimpl);
 		STARPU_ASSERT_MSG(!_STARPU_IS_ZERO(length) && !isnan(length),
 		STARPU_ASSERT_MSG(!_STARPU_IS_ZERO(length) && !isnan(length),
 				  "Codelet %s does not have a perfmodel (in directory %s), or is not calibrated enough, please re-run in non-simgrid mode until it is calibrated",
 				  "Codelet %s does not have a perfmodel (in directory %s), or is not calibrated enough, please re-run in non-simgrid mode until it is calibrated",
 				  _starpu_job_get_model_name(j), _starpu_get_perf_model_dir_codelet());
 				  _starpu_job_get_model_name(j), _starpu_get_perf_model_dir_codelet());
                 /* TODO: option to add variance according to performance model,
                 /* TODO: option to add variance according to performance model,
                  * to be able to easily check scheduling robustness */
                  * to be able to easily check scheduling robustness */
 	}
 	}
+	if (isnan(energy))
+	{
+		energy = starpu_task_worker_expected_energy(starpu_task, workerid, sched_ctx_id, j->nimpl);
+		/* TODO: option to add variance according to performance model,
+		 * to be able to easily check scheduling robustness */
+	}
 
 
 #if defined(HAVE_SG_HOST_SPEED) || defined(sg_host_speed)
 #if defined(HAVE_SG_HOST_SPEED) || defined(sg_host_speed)
 #  if defined(HAVE_SG_HOST_SELF) || defined(sg_host_self)
 #  if defined(HAVE_SG_HOST_SELF) || defined(sg_host_self)
@@ -754,6 +764,7 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 		MSG_task_execute(simgrid_task);
 		MSG_task_execute(simgrid_task);
 		MSG_task_destroy(simgrid_task);
 		MSG_task_destroy(simgrid_task);
 #endif
 #endif
+		starpu_energy_use(energy);
 	}
 	}
 	else
 	else
 	{
 	{
@@ -766,6 +777,7 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 #else
 #else
 		task->task = simgrid_task;
 		task->task = simgrid_task;
 #endif
 #endif
+		task->energy = energy;
 		task->finished = finished;
 		task->finished = finished;
 		*finished = 0;
 		*finished = 0;
 		task->next = NULL;
 		task->next = NULL;
@@ -1391,5 +1403,15 @@ void _starpu_simgrid_data_transfer(size_t size, unsigned src_node, unsigned dst_
 }
 }
 #endif
 #endif
 
 
+void starpu_energy_use(float joules)
+{
+	_starpu_simgrid_dynamic_energy += joules;
+}
+
+double starpu_energy_used(void)
+{
+	float idle_power = starpu_get_env_float_default("STARPU_IDLE_POWER", 0.0);
+	return _starpu_simgrid_dynamic_energy + idle_power * starpu_timing_now() / 1000000;
+}
 
 
 #endif
 #endif

+ 1 - 1
src/core/simgrid.h

@@ -66,7 +66,7 @@ void _starpu_simgrid_deinit_late(void);
 void _starpu_simgrid_actor_setup(void);
 void _starpu_simgrid_actor_setup(void);
 void _starpu_simgrid_wait_tasks(int workerid);
 void _starpu_simgrid_wait_tasks(int workerid);
 struct _starpu_job;
 struct _starpu_job;
-void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *job, struct starpu_perfmodel_arch* perf_arch, double length, unsigned *finished);
+void _starpu_simgrid_submit_job(int workerid, int sched_ctx_id, struct _starpu_job *job, struct starpu_perfmodel_arch* perf_arch, double length, double energy, unsigned *finished);
 struct _starpu_data_request;
 struct _starpu_data_request;
 int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node, struct _starpu_data_request *req);
 int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node, struct _starpu_data_request *req);
 union _starpu_async_channel_event;
 union _starpu_async_channel_event;

+ 1 - 0
src/core/task.c

@@ -30,6 +30,7 @@
 #include <common/utils.h>
 #include <common/utils.h>
 #include <common/fxt.h>
 #include <common/fxt.h>
 #include <common/knobs.h>
 #include <common/knobs.h>
+#include <datawizard/memory_nodes.h>
 #include <profiling/profiling.h>
 #include <profiling/profiling.h>
 #include <profiling/bound.h>
 #include <profiling/bound.h>
 #include <math.h>
 #include <math.h>

+ 27 - 0
src/core/topology.c

@@ -2086,7 +2086,11 @@ int _starpu_bind_thread_on_cpu(int cpuid STARPU_ATTRIBUTE_UNUSED, int workerid S
 		{
 		{
 			cpu_worker[cpuid] = workerid;
 			cpu_worker[cpuid] = workerid;
 			if (name)
 			if (name)
+			{
+				if (cpu_name[cpuid])
+					free(cpu_name[cpuid]);
 				cpu_name[cpuid] = strdup(name);
 				cpu_name[cpuid] = strdup(name);
+			}
 		}
 		}
 	}
 	}
 
 
@@ -3219,3 +3223,26 @@ void starpu_topology_print(FILE *output)
 		fprintf(output, "\n");
 		fprintf(output, "\n");
 	}
 	}
 }
 }
+
+int starpu_get_pu_os_index(unsigned logical_index)
+{
+#ifdef STARPU_HAVE_HWLOC
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+	struct _starpu_machine_topology *topology = &config->topology;
+
+	hwloc_topology_t topo = topology->hwtopology;
+
+	return hwloc_get_obj_by_type(topo, HWLOC_OBJ_PU, logical_index)->os_index;
+#else
+	return logical_index;
+#endif
+}
+
+#ifdef STARPU_HAVE_HWLOC
+hwloc_topology_t starpu_get_hwloc_topology(void)
+{
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+
+	return config->topology.hwtopology;
+}
+#endif

+ 6 - 0
src/core/workers.c

@@ -2692,31 +2692,37 @@ int starpu_worker_get_relax_state(void)
 	return _starpu_worker_get_relax_state();
 	return _starpu_worker_get_relax_state();
 }
 }
 
 
+#undef starpu_worker_lock
 void starpu_worker_lock(int workerid)
 void starpu_worker_lock(int workerid)
 {
 {
 	_starpu_worker_lock(workerid);
 	_starpu_worker_lock(workerid);
 }
 }
 
 
+#undef starpu_worker_trylock
 int starpu_worker_trylock(int workerid)
 int starpu_worker_trylock(int workerid)
 {
 {
 	return _starpu_worker_trylock(workerid);
 	return _starpu_worker_trylock(workerid);
 }
 }
 
 
+#undef starpu_worker_unlock
 void starpu_worker_unlock(int workerid)
 void starpu_worker_unlock(int workerid)
 {
 {
 	_starpu_worker_unlock(workerid);
 	_starpu_worker_unlock(workerid);
 }
 }
 
 
+#undef starpu_worker_lock_self
 void starpu_worker_lock_self(void)
 void starpu_worker_lock_self(void)
 {
 {
 	_starpu_worker_lock_self();
 	_starpu_worker_lock_self();
 }
 }
 
 
+#undef starpu_worker_unlock_self
 void starpu_worker_unlock_self(void)
 void starpu_worker_unlock_self(void)
 {
 {
 	_starpu_worker_unlock_self();
 	_starpu_worker_unlock_self();
 }
 }
 
 
+#undef starpu_wake_worker_relax
 int starpu_wake_worker_relax(int workerid)
 int starpu_wake_worker_relax(int workerid)
 {
 {
 	return _starpu_wake_worker_relax(workerid);
 	return _starpu_wake_worker_relax(workerid);

+ 6 - 0
src/core/workers.h

@@ -1132,6 +1132,7 @@ static inline void _starpu_worker_lock(int workerid)
 		STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
 		STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
 	}
 	}
 }
 }
+#define starpu_worker_lock _starpu_worker_lock
 
 
 static inline int _starpu_worker_trylock(int workerid)
 static inline int _starpu_worker_trylock(int workerid)
 {
 {
@@ -1162,6 +1163,7 @@ static inline int _starpu_worker_trylock(int workerid)
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&cur_worker->sched_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&cur_worker->sched_mutex);
 	return ret;
 	return ret;
 }
 }
+#define starpu_worker_trylock _starpu_worker_trylock
 
 
 static inline void _starpu_worker_unlock(int workerid)
 static inline void _starpu_worker_unlock(int workerid)
 {
 {
@@ -1174,6 +1176,7 @@ static inline void _starpu_worker_unlock(int workerid)
 		starpu_worker_relax_off();
 		starpu_worker_relax_off();
 	}
 	}
 }
 }
+#define starpu_worker_unlock _starpu_worker_unlock
 
 
 static inline void _starpu_worker_lock_self(void)
 static inline void _starpu_worker_lock_self(void)
 {
 {
@@ -1182,6 +1185,7 @@ static inline void _starpu_worker_lock_self(void)
 	STARPU_ASSERT(worker != NULL);
 	STARPU_ASSERT(worker != NULL);
 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
 }
 }
+#define starpu_worker_lock_self _starpu_worker_lock_self
 
 
 static inline void _starpu_worker_unlock_self(void)
 static inline void _starpu_worker_unlock_self(void)
 {
 {
@@ -1190,6 +1194,7 @@ static inline void _starpu_worker_unlock_self(void)
 	STARPU_ASSERT(worker != NULL);
 	STARPU_ASSERT(worker != NULL);
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
 }
 }
+#define starpu_worker_unlock_self _starpu_worker_unlock_self
 
 
 static inline int _starpu_wake_worker_relax(int workerid)
 static inline int _starpu_wake_worker_relax(int workerid)
 {
 {
@@ -1198,6 +1203,7 @@ static inline int _starpu_wake_worker_relax(int workerid)
 	_starpu_worker_unlock(workerid);
 	_starpu_worker_unlock(workerid);
 	return ret;
 	return ret;
 }
 }
+#define starpu_wake_worker_relax _starpu_wake_worker_relax
 
 
 int starpu_wake_worker_relax_light(int workerid);
 int starpu_wake_worker_relax_light(int workerid);
 
 

+ 58 - 17
src/datawizard/coherency.c

@@ -411,7 +411,7 @@ int _starpu_determine_request_path(starpu_data_handle_t handle,
 /* handle->lock should be taken. r is returned locked. The node parameter
 /* handle->lock should be taken. r is returned locked. The node parameter
  * indicate either the source of the request, or the destination for a
  * indicate either the source of the request, or the destination for a
  * write-only request. */
  * write-only request. */
-static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, unsigned is_prefetch)
+static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, enum _starpu_is_prefetch is_prefetch)
 {
 {
 	struct _starpu_data_request *r;
 	struct _starpu_data_request *r;
 
 
@@ -474,7 +474,7 @@ static struct _starpu_data_request *_starpu_search_existing_data_request(struct
 
 
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 								  struct _starpu_data_replicate *dst_replicate,
 								  struct _starpu_data_replicate *dst_replicate,
-								  enum starpu_data_access_mode mode, unsigned is_prefetch,
+								  enum starpu_data_access_mode mode, enum _starpu_is_prefetch is_prefetch,
 								  unsigned async,
 								  unsigned async,
 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 {
 {
@@ -529,7 +529,13 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 #endif
 #endif
 
 
 			if (dst_replicate->mc)
 			if (dst_replicate->mc)
+			{
+				if (is_prefetch == STARPU_TASK_PREFETCH)
+					/* Make sure it stays there */
+					dst_replicate->mc->nb_tasks_prefetch++;
+
 				_starpu_memchunk_recently_used(dst_replicate->mc, requesting_node);
 				_starpu_memchunk_recently_used(dst_replicate->mc, requesting_node);
+			}
 		}
 		}
 
 
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
@@ -574,6 +580,9 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch) == 0)
 			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch) == 0)
 			{
 			{
 				_starpu_update_data_state(handle, dst_replicate, mode);
 				_starpu_update_data_state(handle, dst_replicate, mode);
+				if (is_prefetch == STARPU_TASK_PREFETCH)
+					/* Make sure it stays there */
+					dst_replicate->mc->nb_tasks_prefetch++;
 
 
 				_starpu_spin_unlock(&handle->header_lock);
 				_starpu_spin_unlock(&handle->header_lock);
 
 
@@ -652,9 +661,17 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 				STARPU_ASSERT(r->next_req_count <= STARPU_MAXNODES);
 				STARPU_ASSERT(r->next_req_count <= STARPU_MAXNODES);
 			}
 			}
 		}
 		}
-		else if (!write_invalidation)
-			/* The last request will perform the callback after termination */
-			_starpu_data_request_append_callback(r, callback_func, callback_arg);
+		else
+		{
+			if (is_prefetch == STARPU_TASK_PREFETCH)
+				/* Make last request add the prefetch count on the mc to keep the data
+				 * there until the task gets to execute.  */
+				r->nb_tasks_prefetch++;
+
+			if (!write_invalidation)
+				/* The last request will perform the callback after termination */
+				_starpu_data_request_append_callback(r, callback_func, callback_arg);
+		}
 
 
 
 
 		if (reused_requests[hop])
 		if (reused_requests[hop])
@@ -719,7 +736,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 }
 }
 
 
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *dst_replicate,
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *dst_replicate,
-			       enum starpu_data_access_mode mode, unsigned detached, unsigned is_prefetch, unsigned async,
+			       enum starpu_data_access_mode mode, unsigned detached, enum _starpu_is_prefetch is_prefetch, unsigned async,
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 {
 {
         _STARPU_LOG_IN();
         _STARPU_LOG_IN();
@@ -733,7 +750,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 	if (cpt == STARPU_SPIN_MAXTRY)
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
 		_starpu_spin_lock(&handle->header_lock);
 
 
-	if (is_prefetch > 0)
+	if (is_prefetch > STARPU_FETCH)
 	{
 	{
 		unsigned src_node_mask = 0;
 		unsigned src_node_mask = 0;
 
 
@@ -751,6 +768,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 		if (src_node_mask == 0)
 		if (src_node_mask == 0)
 		{
 		{
 			/* no valid copy, nothing to prefetch */
 			/* no valid copy, nothing to prefetch */
+			STARPU_ASSERT_MSG(handle->init_cl, "Could not find a valid copy of the data, and no handle initialization function");
 			_starpu_spin_unlock(&handle->header_lock);
 			_starpu_spin_unlock(&handle->header_lock);
 			return 0;
 			return 0;
 		}
 		}
@@ -789,17 +807,22 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 
 
 static int idle_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
 static int idle_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
 {
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, 2, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_IDLEFETCH, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
 }
 }
 
 
-static int prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
+static int task_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
 {
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, 1, 1, NULL, NULL, prio, "prefetch_data_on_node");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_TASK_PREFETCH, 1, NULL, NULL, prio, "task_prefetch_data_on_node");
+}
+
+static int STARPU_ATTRIBUTE_UNUSED prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
+{
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_PREFETCH, 1, NULL, NULL, prio, "prefetch_data_on_node");
 }
 }
 
 
 static int fetch_data(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
 static int fetch_data(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
 {
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, 0, 0, NULL, NULL, prio, "fetch_data");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, STARPU_FETCH, 0, NULL, NULL, prio, "fetch_data");
 }
 }
 
 
 uint32_t _starpu_get_data_refcnt(starpu_data_handle_t handle, unsigned node)
 uint32_t _starpu_get_data_refcnt(starpu_data_handle_t handle, unsigned node)
@@ -899,7 +922,7 @@ int starpu_prefetch_task_input_on_node_prio(struct starpu_task *task, unsigned t
 	if (j->discontinuous != 0)
 	if (j->discontinuous != 0)
 		return 0;
 		return 0;
 #endif
 #endif
-	STARPU_ASSERT(!task->prefetched);
+	STARPU_ASSERT_MSG(!task->prefetched, "Prefetching was already requested for this task! Did you set 'prefetches' to 1 in the starpu_sched_policy structure?");
 	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	unsigned index;
 	unsigned index;
 
 
@@ -918,10 +941,11 @@ int starpu_prefetch_task_input_on_node_prio(struct starpu_task *task, unsigned t
 		int node = _starpu_task_data_get_node_on_node(task, index, target_node);
 		int node = _starpu_task_data_get_node_on_node(task, index, target_node);
 
 
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
-		prefetch_data_on_node(handle, node, replicate, mode, prio);
+		task_prefetch_data_on_node(handle, node, replicate, mode, prio);
 
 
 		_starpu_set_data_requested_flag_if_needed(handle, replicate);
 		_starpu_set_data_requested_flag_if_needed(handle, replicate);
 	}
 	}
+	task->prefetched = 1;
 
 
 	return 0;
 	return 0;
 }
 }
@@ -976,7 +1000,7 @@ int starpu_prefetch_task_input_for_prio(struct starpu_task *task, unsigned worke
 	if (j->discontinuous != 0)
 	if (j->discontinuous != 0)
 		return 0;
 		return 0;
 #endif
 #endif
-	STARPU_ASSERT(!task->prefetched);
+	STARPU_ASSERT_MSG(!task->prefetched, "Prefetching was already requested for this task! Did you set 'prefetches' to 1 in the starpu_sched_policy structure?");
 	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	unsigned index;
 	unsigned index;
 
 
@@ -995,10 +1019,11 @@ int starpu_prefetch_task_input_for_prio(struct starpu_task *task, unsigned worke
 		int node = _starpu_task_data_get_node_on_worker(task, index, worker);
 		int node = _starpu_task_data_get_node_on_worker(task, index, worker);
 
 
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
-		prefetch_data_on_node(handle, node, replicate, mode, prio);
+		task_prefetch_data_on_node(handle, node, replicate, mode, prio);
 
 
 		_starpu_set_data_requested_flag_if_needed(handle, replicate);
 		_starpu_set_data_requested_flag_if_needed(handle, replicate);
 	}
 	}
+	task->prefetched = 1;
 
 
 	return 0;
 	return 0;
 }
 }
@@ -1140,7 +1165,7 @@ int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, in
 
 
 		if (async)
 		if (async)
 		{
 		{
-			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, 0, 1,
+			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, STARPU_FETCH, 1,
 					_starpu_fetch_task_input_cb, worker, 0, "_starpu_fetch_task_input");
 					_starpu_fetch_task_input_cb, worker, 0, "_starpu_fetch_task_input");
 #ifdef STARPU_SIMGRID
 #ifdef STARPU_SIMGRID
 			if (_starpu_simgrid_fetching_input_cost())
 			if (_starpu_simgrid_fetching_input_cost())
@@ -1230,7 +1255,23 @@ void _starpu_fetch_task_input_tail(struct starpu_task *task, struct _starpu_job
 		local_replicate = get_replicate(handle, mode, workerid, node);
 		local_replicate = get_replicate(handle, mode, workerid, node);
 		_starpu_spin_lock(&handle->header_lock);
 		_starpu_spin_lock(&handle->header_lock);
 		if (local_replicate->mc)
 		if (local_replicate->mc)
+		{
 			local_replicate->mc->diduse = 1;
 			local_replicate->mc->diduse = 1;
+			if (task->prefetched && local_replicate->initialized &&
+				/* See prefetch conditions in
+				 * starpu_prefetch_task_input_on_node_prio and alike */
+				!(mode & (STARPU_SCRATCH|STARPU_REDUX)) &&
+				(mode & STARPU_R))
+			{
+				/* Allocations or transfer prefetchs should have been done by now and marked
+				 * this mc as needed for us.
+				 * Now that we added a reference for the task, we can relieve that.  */
+				/* Note: the replicate might have been evicted in between, thus not 100% sure
+				 * that our prefetch request is still recorded here.  */
+				if (local_replicate->mc->nb_tasks_prefetch > 0)
+					local_replicate->mc->nb_tasks_prefetch--;
+			}
+		}
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
 
 
 		_STARPU_TASK_SET_INTERFACE(task , local_replicate->data_interface, descrs[index].index);
 		_STARPU_TASK_SET_INTERFACE(task , local_replicate->data_interface, descrs[index].index);
@@ -1379,7 +1420,7 @@ void _starpu_fetch_nowhere_task_input(struct _starpu_job *j)
 
 
 		local_replicate = get_replicate(handle, mode, -1, node);
 		local_replicate = get_replicate(handle, mode, -1, node);
 
 
-		_starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, 0, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
+		_starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, STARPU_FETCH, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
 	}
 	}
 
 
 	if (profiling && task->profiling_info)
 	if (profiling && task->profiling_info)

+ 2 - 2
src/datawizard/coherency.h

@@ -298,7 +298,7 @@ struct _starpu_data_state
  * async means that _starpu_fetch_data_on_node will wait for completion of the request
  * async means that _starpu_fetch_data_on_node will wait for completion of the request
  */
  */
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate,
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate,
-			       enum starpu_data_access_mode mode, unsigned detached, unsigned is_prefetch, unsigned async,
+			       enum starpu_data_access_mode mode, unsigned detached, enum _starpu_is_prefetch is_prefetch, unsigned async,
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 /** This releases a reference on the handle */
 /** This releases a reference on the handle */
 void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
 void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
@@ -341,7 +341,7 @@ int _starpu_determine_request_path(starpu_data_handle_t handle,
  */
  */
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 								  struct _starpu_data_replicate *dst_replicate,
 								  struct _starpu_data_replicate *dst_replicate,
-								  enum starpu_data_access_mode mode, unsigned is_prefetch,
+								  enum starpu_data_access_mode mode, enum _starpu_is_prefetch is_prefetch,
 								  unsigned async,
 								  unsigned async,
 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 
 

+ 2 - 2
src/datawizard/copy_driver.c

@@ -203,7 +203,7 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 									unsigned donotread,
 									unsigned donotread,
 									struct _starpu_data_request *req,
 									struct _starpu_data_request *req,
 									unsigned may_alloc,
 									unsigned may_alloc,
-									unsigned prefetch STARPU_ATTRIBUTE_UNUSED)
+									enum _starpu_is_prefetch prefetch STARPU_ATTRIBUTE_UNUSED)
 {
 {
 	if (!donotread)
 	if (!donotread)
 	{
 	{
@@ -221,7 +221,7 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 			/* We're not supposed to allocate there at the moment */
 			/* We're not supposed to allocate there at the moment */
 			return -ENOMEM;
 			return -ENOMEM;
 
 
-		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, req ? req->prefetch : 0);
+		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, req ? req->prefetch : STARPU_FETCH);
 		if (ret_alloc)
 		if (ret_alloc)
 			return -ENOMEM;
 			return -ENOMEM;
 	}
 	}

+ 10 - 1
src/datawizard/copy_driver.h

@@ -47,6 +47,15 @@ extern "C"
 struct _starpu_data_request;
 struct _starpu_data_request;
 struct _starpu_data_replicate;
 struct _starpu_data_replicate;
 
 
+enum _starpu_is_prefetch
+{
+	STARPU_FETCH = 0,		/* A task really needs it now! */
+	STARPU_TASK_PREFETCH = 1,	/* A task will need it soon */
+	STARPU_PREFETCH = 2,		/* It is a good idea to have it asap */
+	STARPU_IDLEFETCH = 3,		/* Get this here when you have time to */
+	STARPU_NFETCH
+};
+
 #ifdef STARPU_USE_MIC
 #ifdef STARPU_USE_MIC
 /** MIC needs memory_node to know which MIC is concerned.
 /** MIC needs memory_node to know which MIC is concerned.
  * mark is used to wait asynchronous request.
  * mark is used to wait asynchronous request.
@@ -132,7 +141,7 @@ int _starpu_driver_copy_data_1_to_1(starpu_data_handle_t handle,
 				    unsigned donotread,
 				    unsigned donotread,
 				    struct _starpu_data_request *req,
 				    struct _starpu_data_request *req,
 				    unsigned may_alloc,
 				    unsigned may_alloc,
-				    unsigned prefetch);
+				    enum _starpu_is_prefetch prefetch);
 
 
 unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel);
 unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel);
 void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel);
 void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel);

+ 47 - 29
src/datawizard/data_request.c

@@ -25,8 +25,11 @@
 #include <core/simgrid.h>
 #include <core/simgrid.h>
 
 
 /* requests that have not been treated at all */
 /* requests that have not been treated at all */
+#ifdef STARPU_DEVEL
+#warning split into separate out/in queues for each node, so that MAX_PENDING_REQUESTS_PER_NODE is separate for them, since the links are bidirectionnal
+#endif
 static struct _starpu_data_request_prio_list data_requests[STARPU_MAXNODES];
 static struct _starpu_data_request_prio_list data_requests[STARPU_MAXNODES];
-static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES];
+static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES]; /* Contains both task_prefetch and prefetch */
 static struct _starpu_data_request_prio_list idle_requests[STARPU_MAXNODES];
 static struct _starpu_data_request_prio_list idle_requests[STARPU_MAXNODES];
 static starpu_pthread_mutex_t data_requests_list_mutex[STARPU_MAXNODES];
 static starpu_pthread_mutex_t data_requests_list_mutex[STARPU_MAXNODES];
 
 
@@ -121,7 +124,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 							 int handling_node,
 							 int handling_node,
 							 enum starpu_data_access_mode mode,
 							 enum starpu_data_access_mode mode,
 							 unsigned ndeps,
 							 unsigned ndeps,
-							 unsigned is_prefetch,
+							 enum _starpu_is_prefetch is_prefetch,
 							 int prio,
 							 int prio,
 							 unsigned is_write_invalidation,
 							 unsigned is_write_invalidation,
 							 const char *origin)
 							 const char *origin)
@@ -153,6 +156,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
 	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
 	r->completed = 0;
 	r->completed = 0;
 	r->prefetch = is_prefetch;
 	r->prefetch = is_prefetch;
+	r->nb_tasks_prefetch = 0;
 	r->prio = prio;
 	r->prio = prio;
 	r->retval = -1;
 	r->retval = -1;
 	r->ndeps = ndeps;
 	r->ndeps = ndeps;
@@ -307,9 +311,9 @@ void _starpu_post_data_request(struct _starpu_data_request *r)
 
 
 	/* insert the request in the proper list */
 	/* insert the request in the proper list */
 	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node]);
 	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node]);
-	if (r->prefetch == 2)
+	if (r->prefetch >= STARPU_IDLEFETCH)
 		_starpu_data_request_prio_list_push_back(&idle_requests[handling_node], r);
 		_starpu_data_request_prio_list_push_back(&idle_requests[handling_node], r);
-	else if (r->prefetch)
+	else if (r->prefetch > STARPU_FETCH)
 		_starpu_data_request_prio_list_push_back(&prefetch_requests[handling_node], r);
 		_starpu_data_request_prio_list_push_back(&prefetch_requests[handling_node], r);
 	else
 	else
 		_starpu_data_request_prio_list_push_back(&data_requests[handling_node], r);
 		_starpu_data_request_prio_list_push_back(&data_requests[handling_node], r);
@@ -410,6 +414,10 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 	/* Remove a reference on the destination replicate for the request */
 	/* Remove a reference on the destination replicate for the request */
 	if (dst_replicate)
 	if (dst_replicate)
 	{
 	{
+		if (dst_replicate->mc)
+			/* Make sure it stays there for the task.  */
+			dst_replicate->mc->nb_tasks_prefetch += r->nb_tasks_prefetch;
+
 		STARPU_ASSERT(dst_replicate->refcnt > 0);
 		STARPU_ASSERT(dst_replicate->refcnt > 0);
 		dst_replicate->refcnt--;
 		dst_replicate->refcnt--;
 	}
 	}
@@ -460,7 +468,7 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 }
 }
 
 
 /* TODO : accounting to see how much time was spent working for other people ... */
 /* TODO : accounting to see how much time was spent working for other people ... */
-static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned may_alloc, int prefetch)
+static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned may_alloc, enum _starpu_is_prefetch prefetch)
 {
 {
 	starpu_data_handle_t handle = r->handle;
 	starpu_data_handle_t handle = r->handle;
 
 
@@ -535,7 +543,7 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	return 0;
 	return 0;
 }
 }
 
 
-static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list *reqlist, unsigned src_node, unsigned may_alloc, unsigned n, unsigned *pushed, unsigned prefetch)
+static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list *reqlist, unsigned src_node, unsigned may_alloc, unsigned n, unsigned *pushed, enum _starpu_is_prefetch prefetch)
 {
 {
 	struct _starpu_data_request *r;
 	struct _starpu_data_request *r;
 	struct _starpu_data_request_prio_list new_data_requests[prefetch + 1]; /* Indexed by prefetch level */
 	struct _starpu_data_request_prio_list new_data_requests[prefetch + 1]; /* Indexed by prefetch level */
@@ -606,7 +614,7 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 			ret = res;
 			ret = res;
 			/* Prefetch requests might have gotten promoted while in tmp list */
 			/* Prefetch requests might have gotten promoted while in tmp list */
 			_starpu_data_request_prio_list_push_back(&new_data_requests[r->prefetch], r);
 			_starpu_data_request_prio_list_push_back(&new_data_requests[r->prefetch], r);
-			if (prefetch)
+			if (prefetch > STARPU_FETCH)
 				/* Prefetching more there would make the situation even worse */
 				/* Prefetching more there would make the situation even worse */
 				break;
 				break;
 		}
 		}
@@ -636,20 +644,25 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 	if (i <= prefetch)
 	if (i <= prefetch)
 	{
 	{
 		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
 		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
-		if (!(_starpu_data_request_prio_list_empty(&new_data_requests[0])))
+		if (!(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_FETCH])))
+		{
+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_FETCH], &data_requests[src_node]);
+			data_requests[src_node] = new_data_requests[STARPU_FETCH];
+		}
+		if (prefetch >= STARPU_TASK_PREFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_TASK_PREFETCH])))
 		{
 		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[0], &data_requests[src_node]);
-			data_requests[src_node] = new_data_requests[0];
+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_TASK_PREFETCH], &prefetch_requests[src_node]);
+			prefetch_requests[src_node] = new_data_requests[STARPU_TASK_PREFETCH];
 		}
 		}
-		if (prefetch >= 1 && !(_starpu_data_request_prio_list_empty(&new_data_requests[1])))
+		if (prefetch >= STARPU_PREFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_PREFETCH])))
 		{
 		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[1], &prefetch_requests[src_node]);
-			prefetch_requests[src_node] = new_data_requests[1];
+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_PREFETCH], &prefetch_requests[src_node]);
+			prefetch_requests[src_node] = new_data_requests[STARPU_PREFETCH];
 		}
 		}
-		if (prefetch >= 2 && !(_starpu_data_request_prio_list_empty(&new_data_requests[2])))
+		if (prefetch >= STARPU_IDLEFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_IDLEFETCH])))
 		{
 		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[2], &idle_requests[src_node]);
-			idle_requests[src_node] = new_data_requests[2];
+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_IDLEFETCH], &idle_requests[src_node]);
+			idle_requests[src_node] = new_data_requests[STARPU_IDLEFETCH];
 		}
 		}
 		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
 		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
 
 
@@ -675,17 +688,17 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 
 
 int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 {
 {
-	return __starpu_handle_node_data_requests(data_requests, src_node, may_alloc, MAX_PENDING_REQUESTS_PER_NODE, pushed, 0);
+	return __starpu_handle_node_data_requests(data_requests, src_node, may_alloc, MAX_PENDING_REQUESTS_PER_NODE, pushed, STARPU_FETCH);
 }
 }
 
 
 int _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 int _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 {
 {
-	return __starpu_handle_node_data_requests(prefetch_requests, src_node, may_alloc, MAX_PENDING_PREFETCH_REQUESTS_PER_NODE, pushed, 1);
+	return __starpu_handle_node_data_requests(prefetch_requests, src_node, may_alloc, MAX_PENDING_PREFETCH_REQUESTS_PER_NODE, pushed, STARPU_PREFETCH);
 }
 }
 
 
 int _starpu_handle_node_idle_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 int _starpu_handle_node_idle_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 {
 {
-	return __starpu_handle_node_data_requests(idle_requests, src_node, may_alloc, MAX_PENDING_IDLE_REQUESTS_PER_NODE, pushed, 2);
+	return __starpu_handle_node_data_requests(idle_requests, src_node, may_alloc, MAX_PENDING_IDLE_REQUESTS_PER_NODE, pushed, STARPU_IDLEFETCH);
 }
 }
 
 
 static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
 static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
@@ -836,11 +849,15 @@ int _starpu_check_that_no_data_request_is_pending(unsigned node)
 }
 }
 
 
 
 
-void _starpu_update_prefetch_status(struct _starpu_data_request *r, unsigned prefetch)
+void _starpu_update_prefetch_status(struct _starpu_data_request *r, enum _starpu_is_prefetch prefetch)
 {
 {
 	STARPU_ASSERT(r->prefetch > prefetch);
 	STARPU_ASSERT(r->prefetch > prefetch);
 	r->prefetch=prefetch;
 	r->prefetch=prefetch;
 
 
+	if (prefetch >= STARPU_IDLEFETCH)
+		/* No possible actual change */
+		return;
+
 	/* We have to promote chained_request too! */
 	/* We have to promote chained_request too! */
 	unsigned chained_req;
 	unsigned chained_req;
 	for (chained_req = 0; chained_req < r->next_req_count; chained_req++)
 	for (chained_req = 0; chained_req < r->next_req_count; chained_req++)
@@ -852,19 +869,20 @@ void _starpu_update_prefetch_status(struct _starpu_data_request *r, unsigned pre
 
 
 	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[r->handling_node]);
 	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[r->handling_node]);
 
 
+	int found = 1;
+
 	/* The request can be in a different list (handling request or the temp list)
 	/* The request can be in a different list (handling request or the temp list)
-	 * we have to check that it is really in the prefetch list. */
+	 * we have to check that it is really in the prefetch or idle list. */
 	if (_starpu_data_request_prio_list_ismember(&prefetch_requests[r->handling_node], r))
 	if (_starpu_data_request_prio_list_ismember(&prefetch_requests[r->handling_node], r))
-	{
-		_starpu_data_request_prio_list_erase(&prefetch_requests[r->handling_node],r);
-		_starpu_data_request_prio_list_push_back(&data_requests[r->handling_node],r);
-	}
-	/* The request can be in a different list (handling request or the temp list)
-	 * we have to check that it is really in the idle list. */
+		_starpu_data_request_prio_list_erase(&prefetch_requests[r->handling_node], r);
 	else if (_starpu_data_request_prio_list_ismember(&idle_requests[r->handling_node], r))
 	else if (_starpu_data_request_prio_list_ismember(&idle_requests[r->handling_node], r))
+		_starpu_data_request_prio_list_erase(&idle_requests[r->handling_node], r);
+	else
+		found = 0;
+
+	if (found)
 	{
 	{
-		_starpu_data_request_prio_list_erase(&idle_requests[r->handling_node],r);
-		if (prefetch == 1)
+		if (prefetch > STARPU_FETCH)
 			_starpu_data_request_prio_list_push_back(&prefetch_requests[r->handling_node],r);
 			_starpu_data_request_prio_list_push_back(&prefetch_requests[r->handling_node],r);
 		else
 		else
 			_starpu_data_request_prio_list_push_back(&data_requests[r->handling_node],r);
 			_starpu_data_request_prio_list_push_back(&data_requests[r->handling_node],r);

+ 7 - 8
src/datawizard/data_request.h

@@ -79,12 +79,11 @@ LIST_TYPE(_starpu_data_request,
 	/** Whether the transfer is completed. */
 	/** Whether the transfer is completed. */
 	unsigned completed;
 	unsigned completed;
 
 
-	/** Whether this is just a prefetch request:
-	 * 0 for fetch,
-	 * 1 for prefetch (dependencies have just been released)
-	 * 2 for idle (a good idea to do it some time, but no hurry at all)
-	 */
-	unsigned prefetch;
+	/** Whether this is just a prefetch request */
+	enum _starpu_is_prefetch prefetch;
+
+	/** Number of tasks which used this as a prefetch */
+	unsigned nb_tasks_prefetch;
 
 
 	/** Priority of the request. Default is 0 */
 	/** Priority of the request. Default is 0 */
 	int prio;
 	int prio;
@@ -151,7 +150,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 							 int handling_node,
 							 int handling_node,
 							 enum starpu_data_access_mode mode,
 							 enum starpu_data_access_mode mode,
 							 unsigned ndeps,
 							 unsigned ndeps,
-							 unsigned is_prefetch,
+							 enum _starpu_is_prefetch is_prefetch,
 							 int prio,
 							 int prio,
 							 unsigned is_write_invalidation,
 							 unsigned is_write_invalidation,
 							 const char *origin) STARPU_ATTRIBUTE_MALLOC;
 							 const char *origin) STARPU_ATTRIBUTE_MALLOC;
@@ -162,5 +161,5 @@ void _starpu_data_request_append_callback(struct _starpu_data_request *r,
 					  void (*callback_func)(void *),
 					  void (*callback_func)(void *),
 					  void *callback_arg);
 					  void *callback_arg);
 
 
-void _starpu_update_prefetch_status(struct _starpu_data_request *r, unsigned prefetch);
+void _starpu_update_prefetch_status(struct _starpu_data_request *r, enum _starpu_is_prefetch prefetch);
 #endif // __DATA_REQUEST_H__
 #endif // __DATA_REQUEST_H__

+ 2 - 1
src/datawizard/filters.c

@@ -21,6 +21,7 @@
 #include <datawizard/filters.h>
 #include <datawizard/filters.h>
 #include <datawizard/footprint.h>
 #include <datawizard/footprint.h>
 #include <datawizard/interfaces/data_interface.h>
 #include <datawizard/interfaces/data_interface.h>
+#include <datawizard/memory_nodes.h>
 #include <core/task.h>
 #include <core/task.h>
 
 
 /*
 /*
@@ -192,7 +193,7 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		int home_node = initial_handle->home_node;
 		int home_node = initial_handle->home_node;
 		if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
 		if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
 			home_node = STARPU_MAIN_RAM;
 			home_node = STARPU_MAIN_RAM;
-		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], 0);
+		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], STARPU_FETCH);
 #ifdef STARPU_DEVEL
 #ifdef STARPU_DEVEL
 #warning we should reclaim memory if allocation failed
 #warning we should reclaim memory if allocation failed
 #endif
 #endif

+ 3 - 0
src/datawizard/interfaces/bcsr_interface.c

@@ -15,6 +15,9 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 
 /*
 /*
  * BCSR : blocked CSR, we use blocks of size (r x c)
  * BCSR : blocked CSR, we use blocks of size (r x c)

+ 3 - 0
src/datawizard/interfaces/block_interface.c

@@ -15,6 +15,9 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 

+ 3 - 0
src/datawizard/interfaces/coo_interface.c

@@ -15,6 +15,9 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 
 static int
 static int
 copy_any_to_any(void *src_interface, unsigned src_node,
 copy_any_to_any(void *src_interface, unsigned src_node,

+ 3 - 0
src/datawizard/interfaces/csr_interface.c

@@ -16,6 +16,9 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 

+ 1 - 1
src/datawizard/interfaces/data_interface.c

@@ -693,7 +693,7 @@ void _starpu_check_if_valid_and_fetch_data_on_node(starpu_data_handle_t handle,
 	}
 	}
 	if (valid)
 	if (valid)
 	{
 	{
-		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, 0, 0, NULL, NULL, 0, origin);
+		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, STARPU_FETCH, 0, NULL, NULL, 0, origin);
 		STARPU_ASSERT(!ret);
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, handle->home_node, replicate);
 		_starpu_release_data_on_node(handle, handle->home_node, replicate);
 	}
 	}

+ 3 - 0
src/datawizard/interfaces/matrix_interface.c

@@ -15,6 +15,9 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 

+ 3 - 0
src/datawizard/interfaces/multiformat_interface.c

@@ -15,6 +15,9 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
 #ifdef STARPU_USE_CUDA
 #ifdef STARPU_USE_CUDA

+ 3 - 0
src/datawizard/interfaces/tensor_interface.c

@@ -15,6 +15,9 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 

+ 3 - 0
src/datawizard/interfaces/variable_interface.c

@@ -15,6 +15,9 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 

+ 3 - 0
src/datawizard/interfaces/vector_interface.c

@@ -15,6 +15,9 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 

+ 3 - 0
src/datawizard/interfaces/void_interface.c

@@ -15,6 +15,9 @@
  */
  */
 
 
 #include <starpu.h>
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 
 static int dummy_copy(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 static int dummy_copy(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 
 

+ 23 - 18
src/datawizard/memalloc.c

@@ -322,7 +322,7 @@ static int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT transfer_subtree_to_node(starpu_d
 		{
 		{
 			/* This is the only copy, push it to destination */
 			/* This is the only copy, push it to destination */
 			struct _starpu_data_request *r;
 			struct _starpu_data_request *r;
-			r = _starpu_create_request_to_fetch_data(handle, dst_replicate, STARPU_R, 0, 0, NULL, NULL, 0, "transfer_subtree_to_node");
+			r = _starpu_create_request_to_fetch_data(handle, dst_replicate, STARPU_R, STARPU_FETCH, 0, NULL, NULL, 0, "transfer_subtree_to_node");
 			/* There is no way we don't need a request, since
 			/* There is no way we don't need a request, since
 			 * source is OWNER, destination can't be having it */
 			 * source is OWNER, destination can't be having it */
 			STARPU_ASSERT(r);
 			STARPU_ASSERT(r);
@@ -546,7 +546,7 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 /* This function is called for memory chunks that are possibly in used (ie. not
 /* This function is called for memory chunks that are possibly in used (ie. not
  * in the cache). They should therefore still be associated to a handle. */
  * in the cache). They should therefore still be associated to a handle. */
 /* mc_lock is held and may be temporarily released! */
 /* mc_lock is held and may be temporarily released! */
-static size_t try_to_throw_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node, struct _starpu_data_replicate *replicate, unsigned is_already_in_mc_list)
+static size_t try_to_throw_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node, struct _starpu_data_replicate *replicate, unsigned is_already_in_mc_list, enum _starpu_is_prefetch is_prefetch)
 {
 {
 	size_t freed = 0;
 	size_t freed = 0;
 
 
@@ -571,6 +571,10 @@ static size_t try_to_throw_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node
 		/* Hasn't been used yet, avoid evicting it */
 		/* Hasn't been used yet, avoid evicting it */
 		return 0;
 		return 0;
 
 
+	if (mc->nb_tasks_prefetch && is_prefetch >= STARPU_TASK_PREFETCH)
+		/* We have not finished executing the tasks this was prefetched for */
+		return 0;
+
 	/* REDUX memchunk */
 	/* REDUX memchunk */
 	if (mc->relaxed_coherency == 2)
 	if (mc->relaxed_coherency == 2)
 	{
 	{
@@ -782,7 +786,7 @@ static int try_to_find_reusable_mc(unsigned node, starpu_data_handle_t data, str
 
 
 /* this function looks for a memory chunk that matches a given footprint in the
 /* this function looks for a memory chunk that matches a given footprint in the
  * list of mem chunk that are not important */
  * list of mem chunk that are not important */
-static int try_to_reuse_not_important_mc(unsigned node, starpu_data_handle_t data, struct _starpu_data_replicate *replicate, uint32_t footprint)
+static int try_to_reuse_not_important_mc(unsigned node, starpu_data_handle_t data, struct _starpu_data_replicate *replicate, uint32_t footprint, enum _starpu_is_prefetch is_prefetch)
 {
 {
 	struct _starpu_mem_chunk *mc, *orig_next_mc, *next_mc;
 	struct _starpu_mem_chunk *mc, *orig_next_mc, *next_mc;
 	int success = 0;
 	int success = 0;
@@ -816,7 +820,7 @@ restart:
 		}
 		}
 
 
 		/* Note: this may unlock mc_list! */
 		/* Note: this may unlock mc_list! */
-		success = try_to_throw_mem_chunk(mc, node, replicate, 1);
+		success = try_to_throw_mem_chunk(mc, node, replicate, 1, is_prefetch);
 
 
 		if (orig_next_mc)
 		if (orig_next_mc)
 		{
 		{
@@ -841,11 +845,14 @@ restart:
  * Try to find a buffer currently in use on the memory node which has the given
  * Try to find a buffer currently in use on the memory node which has the given
  * footprint.
  * footprint.
  */
  */
-static int try_to_reuse_potentially_in_use_mc(unsigned node, starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, uint32_t footprint, int is_prefetch)
+static int try_to_reuse_potentially_in_use_mc(unsigned node, starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, uint32_t footprint, enum _starpu_is_prefetch is_prefetch)
 {
 {
 	struct _starpu_mem_chunk *mc, *next_mc, *orig_next_mc;
 	struct _starpu_mem_chunk *mc, *next_mc, *orig_next_mc;
 	int success = 0;
 	int success = 0;
 
 
+	if (is_prefetch >= STARPU_IDLEFETCH)
+		/* Do not evict a MC just for an idle fetch */
+		return 0;
 	/*
 	/*
 	 * We have to unlock mc_lock before locking header_lock, so we have
 	 * We have to unlock mc_lock before locking header_lock, so we have
 	 * to be careful with the list.  We try to do just one pass, by
 	 * to be careful with the list.  We try to do just one pass, by
@@ -868,14 +875,11 @@ restart:
 		if (mc->remove_notify)
 		if (mc->remove_notify)
 			/* Somebody already working here, skip */
 			/* Somebody already working here, skip */
 			continue;
 			continue;
-		if (is_prefetch > 1)
-			/* Do not evict a MC just for an idle fetch */
-			continue;
-		if (is_prefetch == 1 && !mc->wontuse)
+		if (!mc->wontuse && is_prefetch >= STARPU_PREFETCH)
 			/* Do not evict something that we might reuse, just for a prefetch */
 			/* Do not evict something that we might reuse, just for a prefetch */
-			/* FIXME: but perhaps we won't have any task using it in
-                         * the close future, we should perhaps rather check
-                         * mc->replicate->refcnt? */
+			continue;
+		if (mc->nb_tasks_prefetch && is_prefetch >= STARPU_TASK_PREFETCH)
+			/* Do not evict something that we will reuse, just for a task prefetch */
 			continue;
 			continue;
 		if (mc->footprint != footprint || _starpu_data_interface_compare(handle->per_node[node].data_interface, handle->ops, mc->data->per_node[node].data_interface, mc->ops) != 1)
 		if (mc->footprint != footprint || _starpu_data_interface_compare(handle->per_node[node].data_interface, handle->ops, mc->data->per_node[node].data_interface, mc->ops) != 1)
 			/* Not the right type of interface, skip */
 			/* Not the right type of interface, skip */
@@ -889,7 +893,7 @@ restart:
 		}
 		}
 
 
 		/* Note: this may unlock mc_list! */
 		/* Note: this may unlock mc_list! */
-		success = try_to_throw_mem_chunk(mc, node, replicate, 1);
+		success = try_to_throw_mem_chunk(mc, node, replicate, 1, is_prefetch);
 
 
 		if (orig_next_mc)
 		if (orig_next_mc)
 		{
 		{
@@ -999,7 +1003,7 @@ restart2:
 				next_mc->remove_notify = &next_mc;
 				next_mc->remove_notify = &next_mc;
 			}
 			}
 			/* Note: this may unlock mc_list! */
 			/* Note: this may unlock mc_list! */
-			freed += try_to_throw_mem_chunk(mc, node, NULL, 0);
+			freed += try_to_throw_mem_chunk(mc, node, NULL, 0, STARPU_FETCH);
 
 
 			if (orig_next_mc)
 			if (orig_next_mc)
 			{
 			{
@@ -1218,7 +1222,7 @@ void starpu_memchunk_tidy(unsigned node)
 			}
 			}
 
 
 			_starpu_spin_unlock(&mc_lock[node]);
 			_starpu_spin_unlock(&mc_lock[node]);
-			if (!_starpu_create_request_to_fetch_data(handle, &handle->per_node[target_node], STARPU_R, 2, 1, NULL, NULL, 0, "starpu_memchunk_tidy"))
+			if (!_starpu_create_request_to_fetch_data(handle, &handle->per_node[target_node], STARPU_R, STARPU_IDLEFETCH, 1, NULL, NULL, 0, "starpu_memchunk_tidy"))
 			{
 			{
 				/* No request was actually needed??
 				/* No request was actually needed??
 				 * Odd, but cope with it.  */
 				 * Odd, but cope with it.  */
@@ -1317,6 +1321,7 @@ static struct _starpu_mem_chunk *_starpu_memchunk_init(struct _starpu_data_repli
 	mc->size_interface = interface_size;
 	mc->size_interface = interface_size;
 	mc->remove_notify = NULL;
 	mc->remove_notify = NULL;
 	mc->diduse = 0;
 	mc->diduse = 0;
+	mc->nb_tasks_prefetch = 0;
 	mc->wontuse = 0;
 	mc->wontuse = 0;
 
 
 	return mc;
 	return mc;
@@ -1430,7 +1435,7 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _star
  *
  *
  */
  */
 
 
-static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, unsigned is_prefetch)
+static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, enum _starpu_is_prefetch is_prefetch)
 {
 {
 	unsigned attempts = 0;
 	unsigned attempts = 0;
 	starpu_ssize_t allocated_memory;
 	starpu_ssize_t allocated_memory;
@@ -1514,7 +1519,7 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 			reclaim -= freed;
 			reclaim -= freed;
 
 
 			/* Try to reuse an allocated data with the same interface (to avoid spurious free/alloc) */
 			/* Try to reuse an allocated data with the same interface (to avoid spurious free/alloc) */
-			if (_starpu_has_not_important_data && try_to_reuse_not_important_mc(dst_node, handle, replicate, footprint))
+			if (_starpu_has_not_important_data && try_to_reuse_not_important_mc(dst_node, handle, replicate, footprint, is_prefetch))
 				break;
 				break;
 			if (try_to_reuse_potentially_in_use_mc(dst_node, handle, replicate, footprint, is_prefetch))
 			if (try_to_reuse_potentially_in_use_mc(dst_node, handle, replicate, footprint, is_prefetch))
 			{
 			{
@@ -1596,7 +1601,7 @@ out:
 	return allocated_memory;
 	return allocated_memory;
 }
 }
 
 
-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch)
+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum _starpu_is_prefetch is_prefetch)
 {
 {
 	starpu_ssize_t allocated_memory;
 	starpu_ssize_t allocated_memory;
 
 

+ 9 - 1
src/datawizard/memalloc.h

@@ -26,6 +26,7 @@
 #include <datawizard/interfaces/data_interface.h>
 #include <datawizard/interfaces/data_interface.h>
 #include <datawizard/coherency.h>
 #include <datawizard/coherency.h>
 #include <datawizard/copy_driver.h>
 #include <datawizard/copy_driver.h>
+#include <datawizard/data_request.h>
 
 
 struct _starpu_data_replicate;
 struct _starpu_data_replicate;
 
 
@@ -59,10 +60,17 @@ LIST_TYPE(_starpu_mem_chunk,
 	/** Whether the memchunk is in the clean part of the mc_list */
 	/** Whether the memchunk is in the clean part of the mc_list */
 	unsigned clean:1;
 	unsigned clean:1;
 	/** Was this chunk used since it got allocated?  */
 	/** Was this chunk used since it got allocated?  */
+	/* FIXME: probably useless now with nb_tasks_prefetch */
 	unsigned diduse:1;
 	unsigned diduse:1;
 	/** Was this chunk marked as "won't use"? */
 	/** Was this chunk marked as "won't use"? */
 	unsigned wontuse:1;
 	unsigned wontuse:1;
 
 
+	/** The number of prefetches that we made for this mc for various tasks
+	 * This is also the number of tasks that we will wait to see use this mc before
+	 * we attempt to evict it.
+	 */
+	unsigned nb_tasks_prefetch;
+
 	/** the size of the data is only set when calling _starpu_request_mem_chunk_removal(),
 	/** the size of the data is only set when calling _starpu_request_mem_chunk_removal(),
 	 * it is needed to estimate how much memory is in mc_cache, and by
 	 * it is needed to estimate how much memory is in mc_cache, and by
 	 * free_memory_on_node() which is called when the handle is no longer
 	 * free_memory_on_node() which is called when the handle is no longer
@@ -84,7 +92,7 @@ void _starpu_init_mem_chunk_lists(void);
 void _starpu_deinit_mem_chunk_lists(void);
 void _starpu_deinit_mem_chunk_lists(void);
 void _starpu_mem_chunk_init_last(void);
 void _starpu_mem_chunk_init_last(void);
 void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size);
 void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size);
-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch);
+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum _starpu_is_prefetch is_prefetch);
 size_t _starpu_free_all_automatically_allocated_buffers(unsigned node);
 size_t _starpu_free_all_automatically_allocated_buffers(unsigned node);
 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);
 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);
 void _starpu_memchunk_wont_use(struct _starpu_mem_chunk *m, unsigned nodec);
 void _starpu_memchunk_wont_use(struct _starpu_mem_chunk *m, unsigned nodec);

+ 1 - 0
src/datawizard/memory_manager.c

@@ -19,6 +19,7 @@
 #include <common/thread.h>
 #include <common/thread.h>
 #include <common/fxt.h>
 #include <common/fxt.h>
 #include <datawizard/memory_manager.h>
 #include <datawizard/memory_manager.h>
+#include <datawizard/memory_nodes.h>
 #include <core/workers.h>
 #include <core/workers.h>
 #include <starpu_stdlib.h>
 #include <starpu_stdlib.h>
 
 

+ 1 - 0
src/datawizard/reduction.c

@@ -22,6 +22,7 @@
 #include <datawizard/datawizard.h>
 #include <datawizard/datawizard.h>
 #include <drivers/mic/driver_mic_source.h>
 #include <drivers/mic/driver_mic_source.h>
 #include <drivers/mp_common/source_common.h>
 #include <drivers/mp_common/source_common.h>
+#include <datawizard/memory_nodes.h>
 
 
 void starpu_data_set_reduction_methods(starpu_data_handle_t handle,
 void starpu_data_set_reduction_methods(starpu_data_handle_t handle,
 				       struct starpu_codelet *redux_cl,
 				       struct starpu_codelet *redux_cl,

+ 7 - 6
src/datawizard/user_interactions.c

@@ -22,6 +22,7 @@
 #include <datawizard/write_back.h>
 #include <datawizard/write_back.h>
 #include <core/dependencies/data_concurrency.h>
 #include <core/dependencies/data_concurrency.h>
 #include <core/sched_policy.h>
 #include <core/sched_policy.h>
+#include <datawizard/memory_nodes.h>
 
 
 static void _starpu_data_check_initialized(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 static void _starpu_data_check_initialized(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {
 {
@@ -46,7 +47,7 @@ int starpu_data_request_allocation(starpu_data_handle_t handle, unsigned node)
 
 
 	_starpu_spin_lock(&handle->header_lock);
 	_starpu_spin_lock(&handle->header_lock);
 
 
-	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, STARPU_NONE, 0, 1, 0, 0, "starpu_data_request_allocation");
+	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, STARPU_NONE, 0, STARPU_PREFETCH, 0, 0, "starpu_data_request_allocation");
 
 
 	/* we do not increase the refcnt associated to the request since we are
 	/* we do not increase the refcnt associated to the request since we are
 	 * not waiting for its termination */
 	 * not waiting for its termination */
@@ -67,7 +68,7 @@ struct user_interaction_wrapper
 	starpu_pthread_mutex_t lock;
 	starpu_pthread_mutex_t lock;
 	unsigned finished;
 	unsigned finished;
 	unsigned detached;
 	unsigned detached;
-	unsigned prefetch;
+	enum _starpu_is_prefetch prefetch;
 	unsigned async;
 	unsigned async;
 	int prio;
 	int prio;
 	void (*callback)(void *);
 	void (*callback)(void *);
@@ -535,7 +536,7 @@ static void _prefetch_data_on_node(void *arg)
 }
 }
 
 
 static
 static
-int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigned node, unsigned async, enum starpu_data_access_mode mode, unsigned prefetch, int prio)
+int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigned node, unsigned async, enum starpu_data_access_mode mode, enum _starpu_is_prefetch prefetch, int prio)
 {
 {
 	STARPU_ASSERT(handle);
 	STARPU_ASSERT(handle);
 
 
@@ -595,12 +596,12 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 
 
 int starpu_data_fetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 int starpu_data_fetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 {
 {
-	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, 0, 0);
+	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_FETCH, 0);
 }
 }
 
 
 int starpu_data_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
 int starpu_data_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
 {
 {
-	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, 1, prio);
+	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_PREFETCH, prio);
 }
 }
 
 
 int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
@@ -610,7 +611,7 @@ int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, uns
 
 
 int starpu_data_idle_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
 int starpu_data_idle_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
 {
 {
-	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, 2, prio);
+	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_IDLEFETCH, prio);
 }
 }
 
 
 int starpu_data_idle_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 int starpu_data_idle_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)

+ 2 - 1
src/datawizard/write_back.c

@@ -17,6 +17,7 @@
 #include <datawizard/datawizard.h>
 #include <datawizard/datawizard.h>
 #include <datawizard/write_back.h>
 #include <datawizard/write_back.h>
 #include <core/dependencies/data_concurrency.h>
 #include <core/dependencies/data_concurrency.h>
+#include <datawizard/memory_nodes.h>
 
 
 static void wt_callback(void *arg)
 static void wt_callback(void *arg)
 {
 {
@@ -63,7 +64,7 @@ void _starpu_write_through_data(starpu_data_handle_t handle, unsigned requesting
 
 
 				struct _starpu_data_request *r;
 				struct _starpu_data_request *r;
 				r = _starpu_create_request_to_fetch_data(handle, &handle->per_node[node],
 				r = _starpu_create_request_to_fetch_data(handle, &handle->per_node[node],
-									 STARPU_R, 1, 1, wt_callback, handle, 0, "_starpu_write_through_data");
+									 STARPU_R, STARPU_IDLEFETCH, 1, wt_callback, handle, 0, "_starpu_write_through_data");
 
 
 			        /* If no request was created, the handle was already up-to-date on the
 			        /* If no request was created, the handle was already up-to-date on the
 			         * node */
 			         * node */

+ 2 - 2
src/debug/latency.c

@@ -34,7 +34,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
 
 
 		struct _starpu_data_replicate *replicate_0 = &handle->per_node[node0];
 		struct _starpu_data_replicate *replicate_0 = &handle->per_node[node0];
-		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, 0, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
+		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, node0, replicate_0);
 		_starpu_release_data_on_node(handle, node0, replicate_0);
 
 
@@ -44,7 +44,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		_starpu_spin_unlock(&handle->header_lock);
 		_starpu_spin_unlock(&handle->header_lock);
 
 
 		struct _starpu_data_replicate *replicate_1 = &handle->per_node[node1];
 		struct _starpu_data_replicate *replicate_1 = &handle->per_node[node1];
-		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, 0, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
+		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, node1, replicate_1);
 		_starpu_release_data_on_node(handle, node1, replicate_1);
 	}
 	}

+ 10 - 8
src/debug/traces/starpu_fxt.c

@@ -20,6 +20,7 @@
 #include <starpu.h>
 #include <starpu.h>
 #include <common/config.h>
 #include <common/config.h>
 #include <common/uthash.h>
 #include <common/uthash.h>
+#include <datawizard/copy_driver.h>
 #include <string.h>
 #include <string.h>
 
 
 #ifdef STARPU_HAVE_POTI
 #ifdef STARPU_HAVE_POTI
@@ -1194,8 +1195,8 @@ static void handle_new_mem_node(struct fxt_ev_64 *ev, struct starpu_fxt_options
  */
  */
 static int create_ordered_stream_id (int nodeid, int devid)
 static int create_ordered_stream_id (int nodeid, int devid)
 {
 {
-	static int stable[MAX_MPI_NODES][STARPU_MAXCUDADEVS];
-	STARPU_ASSERT(nodeid < MAX_MPI_NODES);
+	static int stable[STARPU_FXT_MAX_FILES][STARPU_MAXCUDADEVS];
+	STARPU_ASSERT(nodeid < STARPU_FXT_MAX_FILES);
 	STARPU_ASSERT(devid < STARPU_MAXCUDADEVS);
 	STARPU_ASSERT(devid < STARPU_MAXCUDADEVS);
 	return stable[nodeid][devid]++;
 	return stable[nodeid][devid]++;
 }
 }
@@ -2268,13 +2269,14 @@ static void handle_mpi_data_set_tag(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 	data->mpi_tag = tag;
 	data->mpi_tag = tag;
 }
 }
 
 
-static const char *copy_link_type(unsigned prefetch)
+static const char *copy_link_type(enum _starpu_is_prefetch prefetch)
 {
 {
 	switch (prefetch)
 	switch (prefetch)
 	{
 	{
-		case 0: return "F";
-		case 1: return "PF";
-		case 2: return "IF";
+		case STARPU_FETCH: return "F";
+		case STARPU_TASK_PREFETCH: return "TF";
+		case STARPU_PREFETCH: return "PF";
+		case STARPU_IDLEFETCH: return "IF";
 		default: STARPU_ASSERT(0);
 		default: STARPU_ASSERT(0);
 	}
 	}
 }
 }
@@ -2285,7 +2287,7 @@ static void handle_start_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 	unsigned dst = ev->param[1];
 	unsigned dst = ev->param[1];
 	unsigned size = ev->param[2];
 	unsigned size = ev->param[2];
 	unsigned comid = ev->param[3];
 	unsigned comid = ev->param[3];
-	unsigned prefetch = ev->param[4];
+	enum _starpu_is_prefetch prefetch = ev->param[4];
 	unsigned long handle = ev->param[5];
 	unsigned long handle = ev->param[5];
 	const char *link_type = copy_link_type(prefetch);
 	const char *link_type = copy_link_type(prefetch);
 
 
@@ -2367,7 +2369,7 @@ static void handle_end_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 	unsigned dst = ev->param[1];
 	unsigned dst = ev->param[1];
 	unsigned long size = ev->param[2];
 	unsigned long size = ev->param[2];
 	unsigned comid = ev->param[3];
 	unsigned comid = ev->param[3];
-	unsigned prefetch = ev->param[4];
+	enum _starpu_is_prefetch prefetch = ev->param[4];
 	const char *link_type = copy_link_type(prefetch);
 	const char *link_type = copy_link_type(prefetch);
 
 
 	char *prefix = options->file_prefix;
 	char *prefix = options->file_prefix;

+ 0 - 2
src/debug/traces/starpu_fxt.h

@@ -41,8 +41,6 @@
 #include <starpu.h>
 #include <starpu.h>
 #include "../../../include/starpu_fxt.h"
 #include "../../../include/starpu_fxt.h"
 
 
-#define MAX_MPI_NODES 64
-
 extern char _starpu_last_codelet_symbol[STARPU_NMAXWORKERS][(FXT_MAX_PARAMS-5)*sizeof(unsigned long)];
 extern char _starpu_last_codelet_symbol[STARPU_NMAXWORKERS][(FXT_MAX_PARAMS-5)*sizeof(unsigned long)];
 
 
 void _starpu_fxt_dag_init(char *dag_filename);
 void _starpu_fxt_dag_init(char *dag_filename);

+ 19 - 19
src/debug/traces/starpu_fxt_mpi.c

@@ -103,27 +103,27 @@ int _starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *ke
  */
  */
 
 
 /* the list of MPI transfers found in the different traces */
 /* the list of MPI transfers found in the different traces */
-static struct mpi_transfer *mpi_sends[MAX_MPI_NODES] = {NULL};
-static struct mpi_transfer *mpi_recvs[MAX_MPI_NODES] = {NULL};
+static struct mpi_transfer *mpi_sends[STARPU_FXT_MAX_FILES] = {NULL};
+static struct mpi_transfer *mpi_recvs[STARPU_FXT_MAX_FILES] = {NULL};
 
 
 /* number of available slots in the lists  */
 /* number of available slots in the lists  */
-unsigned mpi_sends_list_size[MAX_MPI_NODES] = {0};
-unsigned mpi_recvs_list_size[MAX_MPI_NODES] = {0};
+unsigned mpi_sends_list_size[STARPU_FXT_MAX_FILES] = {0};
+unsigned mpi_recvs_list_size[STARPU_FXT_MAX_FILES] = {0};
 
 
 /* number of slots actually used in the list  */
 /* number of slots actually used in the list  */
-unsigned mpi_sends_used[MAX_MPI_NODES] = {0};
-unsigned mpi_recvs_used[MAX_MPI_NODES] = {0};
+unsigned mpi_sends_used[STARPU_FXT_MAX_FILES] = {0};
+unsigned mpi_recvs_used[STARPU_FXT_MAX_FILES] = {0};
 
 
 /* number of slots already matched at the beginning of the list. This permits
 /* number of slots already matched at the beginning of the list. This permits
  * going through the lists from the beginning to match each and every
  * going through the lists from the beginning to match each and every
  * transfer, thus avoiding a quadratic complexity. */
  * transfer, thus avoiding a quadratic complexity. */
-unsigned mpi_recvs_matched[MAX_MPI_NODES][MAX_MPI_NODES] = { {0} };
-unsigned mpi_sends_matched[MAX_MPI_NODES][MAX_MPI_NODES] = { {0} };
+unsigned mpi_recvs_matched[STARPU_FXT_MAX_FILES][STARPU_FXT_MAX_FILES] = { {0} };
+unsigned mpi_sends_matched[STARPU_FXT_MAX_FILES][STARPU_FXT_MAX_FILES] = { {0} };
 
 
 void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED, long mpi_tag, size_t size, float date, long jobid, unsigned long handle)
 void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED, long mpi_tag, size_t size, float date, long jobid, unsigned long handle)
 {
 {
 	STARPU_ASSERT(src >= 0);
 	STARPU_ASSERT(src >= 0);
-	if (src >= MAX_MPI_NODES)
+	if (src >= STARPU_FXT_MAX_FILES)
 		return;
 		return;
 	unsigned slot = mpi_sends_used[src]++;
 	unsigned slot = mpi_sends_used[src]++;
 
 
@@ -153,7 +153,7 @@ void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED,
 
 
 void _starpu_fxt_mpi_add_recv_transfer(int src STARPU_ATTRIBUTE_UNUSED, int dst, long mpi_tag, float date, long jobid, unsigned long handle)
 void _starpu_fxt_mpi_add_recv_transfer(int src STARPU_ATTRIBUTE_UNUSED, int dst, long mpi_tag, float date, long jobid, unsigned long handle)
 {
 {
-	if (dst >= MAX_MPI_NODES)
+	if (dst >= STARPU_FXT_MAX_FILES)
 		return;
 		return;
 	unsigned slot = mpi_recvs_used[dst]++;
 	unsigned slot = mpi_recvs_used[dst]++;
 
 
@@ -220,11 +220,11 @@ static unsigned long mpi_com_id = 0;
 
 
 static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comms_file, unsigned n)
 static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comms_file, unsigned n)
 {
 {
-	unsigned slot[MAX_MPI_NODES] = { 0 }, node;
+	unsigned slot[STARPU_FXT_MAX_FILES] = { 0 }, node;
 	unsigned nb_wrong_comm_timing = 0;
 	unsigned nb_wrong_comm_timing = 0;
 	struct mpi_transfer_list pending_receives; /* Sorted list of matches which have not happened yet */
 	struct mpi_transfer_list pending_receives; /* Sorted list of matches which have not happened yet */
-	double current_out_bandwidth[MAX_MPI_NODES] = { 0. };
-	double current_in_bandwidth[MAX_MPI_NODES] = { 0. };
+	double current_out_bandwidth[STARPU_FXT_MAX_FILES] = { 0. };
+	double current_in_bandwidth[STARPU_FXT_MAX_FILES] = { 0. };
 #ifdef STARPU_HAVE_POTI
 #ifdef STARPU_HAVE_POTI
 	char mpi_container[STARPU_POTI_STR_LEN];
 	char mpi_container[STARPU_POTI_STR_LEN];
 #endif
 #endif
@@ -246,7 +246,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 		else
 		else
 			start_date = mpi_transfer_list_front(&pending_receives)->date;
 			start_date = mpi_transfer_list_front(&pending_receives)->date;
 
 
-		src = MAX_MPI_NODES;
+		src = STARPU_FXT_MAX_FILES;
 		for (node = 0; node < n; node++)
 		for (node = 0; node < n; node++)
 		{
 		{
 			if (slot[node] < mpi_sends_used[node] && mpi_sends[node][slot[node]].date < start_date)
 			if (slot[node] < mpi_sends_used[node] && mpi_sends[node][slot[node]].date < start_date)
@@ -260,7 +260,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 			/* No event any more, we're finished! */
 			/* No event any more, we're finished! */
 			break;
 			break;
 
 
-		if (src == MAX_MPI_NODES)
+		if (src == STARPU_FXT_MAX_FILES)
 		{
 		{
 			/* Pending match is earlier than all new sends, finish its communication */
 			/* Pending match is earlier than all new sends, finish its communication */
 			match = mpi_transfer_list_pop_front(&pending_receives);
 			match = mpi_transfer_list_pop_front(&pending_receives);
@@ -284,7 +284,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 		size_t size = cur->size;
 		size_t size = cur->size;
 		unsigned long send_handle = cur->handle;
 		unsigned long send_handle = cur->handle;
 
 
-		if (dst < MAX_MPI_NODES)
+		if (dst < STARPU_FXT_MAX_FILES)
 			match = try_to_match_send_transfer(src, dst, mpi_tag);
 			match = try_to_match_send_transfer(src, dst, mpi_tag);
 		else
 		else
 			match = NULL;
 			match = NULL;
@@ -377,10 +377,10 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
 
 void _starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks STARPU_ATTRIBUTE_UNUSED, FILE *out_paje_file, FILE* out_comms_file)
 void _starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks STARPU_ATTRIBUTE_UNUSED, FILE *out_paje_file, FILE* out_comms_file)
 {
 {
-	if (options->ninputfiles > MAX_MPI_NODES)
+	if (options->ninputfiles > STARPU_FXT_MAX_FILES)
 	{
 	{
-		_STARPU_DISP("Warning: %u files given, maximum %u supported, truncating to %u\n", options->ninputfiles, MAX_MPI_NODES, MAX_MPI_NODES);
-		options->ninputfiles = MAX_MPI_NODES;
+		_STARPU_DISP("Warning: %u files given, maximum %u supported, truncating to %u\n", options->ninputfiles, STARPU_FXT_MAX_FILES, STARPU_FXT_MAX_FILES);
+		options->ninputfiles = STARPU_FXT_MAX_FILES;
 	}
 	}
 
 
 	/* display the MPI transfers if possible */
 	/* display the MPI transfers if possible */

+ 2 - 0
src/debug/traces/starpu_paje.c

@@ -398,6 +398,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 	/* Link types */
 	/* Link types */
 	poti_DefineLinkType("MPIL", "MPIP", "MPICt", "MPICt", "MPI communication");
 	poti_DefineLinkType("MPIL", "MPIP", "MPICt", "MPICt", "MPI communication");
 	poti_DefineLinkType("F", "P", "Mm", "Mm", "Intra-node data Fetch");
 	poti_DefineLinkType("F", "P", "Mm", "Mm", "Intra-node data Fetch");
+	poti_DefineLinkType("TF", "P", "Mm", "Mm", "Intra-node data TaskPreFetch");
 	poti_DefineLinkType("PF", "P", "Mm", "Mm", "Intra-node data PreFetch");
 	poti_DefineLinkType("PF", "P", "Mm", "Mm", "Intra-node data PreFetch");
 	poti_DefineLinkType("IF", "P", "Mm", "Mm", "Intra-node data IdleFetch");
 	poti_DefineLinkType("IF", "P", "Mm", "Mm", "Intra-node data IdleFetch");
 	poti_DefineLinkType("WSL", "P", "W", "W", "Work steal");
 	poti_DefineLinkType("WSL", "P", "W", "W", "Work steal");
@@ -551,6 +552,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 6       No       MS     Nothing         \".0 .0 .0\"		\n\
 6       No       MS     Nothing         \".0 .0 .0\"		\n\
 5       MPIL     MPIP	MPICt	MPICt   \"MPI communication\"\n\
 5       MPIL     MPIP	MPICt	MPICt   \"MPI communication\"\n\
 5       F       P	Mm	Mm      \"Intra-node data Fetch\"\n\
 5       F       P	Mm	Mm      \"Intra-node data Fetch\"\n\
+5       TF      P	Mm	Mm      \"Intra-node data TaskPreFetch\"\n\
 5       PF      P	Mm	Mm      \"Intra-node data PreFetch\"\n\
 5       PF      P	Mm	Mm      \"Intra-node data PreFetch\"\n\
 5       IF      P	Mm	Mm      \"Intra-node data IdleFetch\"\n\
 5       IF      P	Mm	Mm      \"Intra-node data IdleFetch\"\n\
 5       WSL     P	W	W       \"Work steal\"\n");
 5       WSL     P	W	W       \"Work steal\"\n");

+ 4 - 1
src/drivers/cpu/driver_cpu.c

@@ -109,7 +109,10 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 				_SIMGRID_TIMER_END;
 				_SIMGRID_TIMER_END;
 			}
 			}
 			else
 			else
-				_starpu_simgrid_submit_job(cpu_args->workerid, j, perf_arch, NAN, NULL);
+			{
+				struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(cpu_args, j);
+				_starpu_simgrid_submit_job(cpu_args->workerid, sched_ctx->id, j, perf_arch, NAN, NAN, NULL);
+			}
 #else
 #else
 #  ifdef STARPU_PAPI
 #  ifdef STARPU_PAPI
 			_starpu_profiling_papi_task_start_counters(task);
 			_starpu_profiling_papi_task_start_counters(task);

+ 29 - 20
src/drivers/cuda/driver_cuda.c

@@ -531,10 +531,13 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 				_SIMGRID_TIMER_END;
 				_SIMGRID_TIMER_END;
 			}
 			}
 		else
 		else
-			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
+		{
+			struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
+			_starpu_simgrid_submit_job(workerid, sched_ctx->id, j, &worker->perf_arch, NAN, NAN,
 				async ? &task_finished[workerid][pipeline_idx] : NULL);
 				async ? &task_finished[workerid][pipeline_idx] : NULL);
+		}
 #else
 #else
-#ifdef HAVE_LIBNVIDIA_ML
+#ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
 		unsigned long long energy_start = 0;
 		unsigned long long energy_start = 0;
 		nvmlReturn_t nvmlRet = -1;
 		nvmlReturn_t nvmlRet = -1;
 		if (profiling && task->profiling_info)
 		if (profiling && task->profiling_info)
@@ -558,7 +561,7 @@ static void finish_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *wor
 	int profiling = starpu_profiling_status_get();
 	int profiling = starpu_profiling_status_get();
 
 
 
 
-#ifdef HAVE_LIBNVIDIA_ML
+#ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
 	if (profiling && j->task->profiling_info && j->task->profiling_info->energy_consumed)
 	if (profiling && j->task->profiling_info && j->task->profiling_info->energy_consumed)
 	{
 	{
 		unsigned long long energy_end;
 		unsigned long long energy_end;
@@ -880,27 +883,33 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 			_starpu_set_local_worker_key(worker);
 			_starpu_set_local_worker_key(worker);
 			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), worker);
 			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), worker);
 			/* See next task if any */
 			/* See next task if any */
-			if (worker->ntasks && worker->current_tasks[worker->first_task] != worker->task_transferring)
+			if (worker->ntasks)
 			{
 			{
-				task = worker->current_tasks[worker->first_task];
-				j = _starpu_get_job_associated_to_task(task);
-				if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
+				if (worker->current_tasks[worker->first_task] != worker->task_transferring)
 				{
 				{
-					/* An asynchronous task, it was already
-					 * queued, it's now running, record its start time.  */
-					_starpu_driver_start_job(worker, j, &worker->perf_arch, 0, starpu_profiling_status_get());
+					task = worker->current_tasks[worker->first_task];
+					j = _starpu_get_job_associated_to_task(task);
+					if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
+					{
+						/* An asynchronous task, it was already
+						 * queued, it's now running, record its start time.  */
+						_starpu_driver_start_job(worker, j, &worker->perf_arch, 0, starpu_profiling_status_get());
+					}
+					else
+					{
+						/* A synchronous task, we have finished
+						 * flushing the pipeline, we can now at
+						 * last execute it.  */
+
+						_STARPU_TRACE_EVENT("sync_task");
+						execute_job_on_cuda(task, worker);
+						_STARPU_TRACE_EVENT("end_sync_task");
+						worker->pipeline_stuck = 0;
+					}
 				}
 				}
 				else
 				else
-				{
-					/* A synchronous task, we have finished
-					 * flushing the pipeline, we can now at
-					 * last execute it.  */
-
-					_STARPU_TRACE_EVENT("sync_task");
-					execute_job_on_cuda(task, worker);
-					_STARPU_TRACE_EVENT("end_sync_task");
-					worker->pipeline_stuck = 0;
-				}
+					/* Data for next task didn't have time to finish transferring :/ */
+					_STARPU_TRACE_WORKER_START_FETCH_INPUT(NULL, workerid);
 			}
 			}
 #ifdef STARPU_USE_FXT
 #ifdef STARPU_USE_FXT
 			int k;
 			int k;

+ 1 - 0
src/drivers/disk/driver_disk.c

@@ -21,6 +21,7 @@
 #include <drivers/disk/driver_disk.h>
 #include <drivers/disk/driver_disk.h>
 #include <drivers/cpu/driver_cpu.h>
 #include <drivers/cpu/driver_cpu.h>
 #include <datawizard/coherency.h>
 #include <datawizard/coherency.h>
+#include <datawizard/memory_nodes.h>
 
 
 int _starpu_disk_copy_src_to_disk(void * src, unsigned src_node, void * dst, size_t dst_offset, unsigned dst_node, size_t size, void * async_channel)
 int _starpu_disk_copy_src_to_disk(void * src, unsigned src_node, void * dst, size_t dst_offset, unsigned dst_node, size_t size, void * async_channel)
 {
 {

+ 0 - 0
src/drivers/driver_common/driver_common.c


Some files were not shown because too many files changed in this diff