
Merge branch 'fpga' of git+ssh://scm.gforge.inria.fr/gitroot/starpu/starpu into fpga

Samuel Thibault 5 years ago
parent
commit
584a70d891
100 changed files with 807 additions and 241 deletions
  1. ChangeLog (+11 -0)
  2. configure.ac (+15 -1)
  3. doc/doxygen/Makefile.am (+1 -1)
  4. doc/doxygen/chapters/210_check_list_performance.doxy (+2 -0)
  5. doc/doxygen/chapters/301_tasks.doxy (+1 -1)
  6. doc/doxygen/chapters/320_scheduling.doxy (+3 -1)
  7. doc/doxygen/chapters/350_scheduling_policy_definition.doxy (+3 -0)
  8. doc/doxygen/chapters/510_configure_options.doxy (+9 -0)
  9. doc/doxygen/chapters/code/vector_scal_cuda.c (+2 -0)
  10. doc/doxygen_dev/Makefile.am (+1 -1)
  11. doc/tutorial/vector_scal_cuda.cu (+2 -0)
  12. examples/basic_examples/block_cuda.cu (+2 -0)
  13. examples/basic_examples/multiformat_conversion_codelets_cuda.cu (+2 -0)
  14. examples/basic_examples/multiformat_cuda.cu (+2 -0)
  15. examples/basic_examples/variable_kernels.cu (+2 -0)
  16. examples/basic_examples/vector_scal_cuda.cu (+2 -0)
  17. examples/filters/custom_mf/conversion.cu (+2 -0)
  18. examples/filters/custom_mf/cuda.cu (+2 -0)
  19. examples/filters/fblock_cuda.cu (+2 -0)
  20. examples/filters/fmultiple_cuda.cu (+4 -0)
  21. examples/incrementer/incrementer_kernels.cu (+2 -0)
  22. examples/interface/complex_kernels.cu (+2 -0)
  23. examples/mult/double.h (+1 -0)
  24. examples/mult/simple.h (+1 -0)
  25. examples/mult/xgemm.c (+1 -1)
  26. examples/pi/SobolQRNG/sobol_gpu.cu (+2 -0)
  27. examples/pi/pi_kernel.cu (+4 -0)
  28. examples/pi/pi_redux_kernel.cu (+4 -0)
  29. examples/reductions/dot_product_kernels.cu (+2 -0)
  30. examples/sched_ctx/axpy_partition_gpu.cu (+2 -0)
  31. examples/spmv/spmv_cuda.cu (+2 -0)
  32. examples/stencil/life_cuda.cu (+2 -0)
  33. examples/stencil/shadow.cu (+2 -0)
  34. include/starpu_config.h.in (+9 -0)
  35. include/starpu_fxt.h (+1 -2)
  36. include/starpu_helper.h (+17 -0)
  37. include/starpu_scheduler.h (+5 -0)
  38. include/starpu_stdlib.h (+23 -0)
  39. include/starpu_task.h (+1 -1)
  40. julia/examples/mult/gpu_mult.cu (+2 -0)
  41. julia/examples/old_examples/gpu_mult.cu (+2 -0)
  42. julia/examples/old_examples/mandelbrot/gpu_mandelbrot.cu (+2 -0)
  43. julia/examples/old_examples/mandelbrot/gpu_mandelbrot_between.cu (+2 -0)
  44. julia/examples/old_examples/mult/gpu_mult.cu (+2 -0)
  45. julia/examples/old_examples/nbody/gpu_nbody.cu (+4 -0)
  46. julia/examples/old_examples/nbody/gpu_nbody_between.cu (+2 -0)
  47. julia/src/compiler/expressions.jl (+4 -0)
  48. mpi/examples/matrix_decomposition/mpi_cholesky.c (+1 -1)
  49. mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c (+223 -57)
  50. mpi/examples/matrix_decomposition/mpi_decomposition_params.c (+12 -1)
  51. mpi/examples/matrix_decomposition/mpi_decomposition_params.h (+8 -0)
  52. mpi/include/starpu_mpi.h (+6 -0)
  53. mpi/src/starpu_mpi_datatype.c (+2 -3)
  54. mpi/src/starpu_mpi_init.c (+5 -0)
  55. mpi/tests/ring_kernel.cu (+2 -0)
  56. src/Makefile.am (+11 -4)
  57. src/core/jobs.c (+3 -0)
  58. src/core/perfmodel/perfmodel_bus.c (+5 -4)
  59. src/core/perfmodel/perfmodel_print.c (+1 -0)
  60. src/core/sched_ctx.c (+1 -0)
  61. src/core/sched_policy.c (+20 -20)
  62. src/core/simgrid.c (+24 -2)
  63. src/core/simgrid.h (+1 -1)
  64. src/core/task.c (+1 -0)
  65. src/core/topology.c (+27 -0)
  66. src/core/workers.c (+6 -0)
  67. src/core/workers.h (+6 -0)
  68. src/datawizard/coherency.c (+58 -17)
  69. src/datawizard/coherency.h (+2 -2)
  70. src/datawizard/copy_driver.c (+2 -2)
  71. src/datawizard/copy_driver.h (+10 -1)
  72. src/datawizard/data_request.c (+47 -29)
  73. src/datawizard/data_request.h (+7 -8)
  74. src/datawizard/filters.c (+2 -1)
  75. src/datawizard/interfaces/bcsr_interface.c (+3 -0)
  76. src/datawizard/interfaces/block_interface.c (+3 -0)
  77. src/datawizard/interfaces/coo_interface.c (+3 -0)
  78. src/datawizard/interfaces/csr_interface.c (+3 -0)
  79. src/datawizard/interfaces/data_interface.c (+1 -1)
  80. src/datawizard/interfaces/matrix_interface.c (+3 -0)
  81. src/datawizard/interfaces/multiformat_interface.c (+3 -0)
  82. src/datawizard/interfaces/tensor_interface.c (+3 -0)
  83. src/datawizard/interfaces/variable_interface.c (+3 -0)
  84. src/datawizard/interfaces/vector_interface.c (+3 -0)
  85. src/datawizard/interfaces/void_interface.c (+3 -0)
  86. src/datawizard/memalloc.c (+23 -18)
  87. src/datawizard/memalloc.h (+9 -1)
  88. src/datawizard/memory_manager.c (+1 -0)
  89. src/datawizard/reduction.c (+1 -0)
  90. src/datawizard/user_interactions.c (+7 -6)
  91. src/datawizard/write_back.c (+2 -1)
  92. src/debug/latency.c (+2 -2)
  93. src/debug/traces/starpu_fxt.c (+10 -8)
  94. src/debug/traces/starpu_fxt.h (+0 -2)
  95. src/debug/traces/starpu_fxt_mpi.c (+19 -19)
  96. src/debug/traces/starpu_paje.c (+2 -0)
  97. src/drivers/cpu/driver_cpu.c (+4 -1)
  98. src/drivers/cuda/driver_cuda.c (+29 -20)
  99. src/drivers/disk/driver_disk.c (+1 -0)
  100. src/drivers/driver_common/driver_common.c (+0 -0)

+ 11 - 0
ChangeLog

@@ -31,9 +31,20 @@ New features:
     files. This file can be parsed by the new script
     starpu_fxt_number_events_to_names.py to convert event keys to event names.
   * New STARPU_PER_WORKER perfmodel.
+  * Add energy accounting in simgrid mode: starpu_energy_use() and
+    starpu_energy_used().
+  * New function starpu_mpi_get_thread_cpuid() to know where the MPI thread
+    is bound.
+  * New function starpu_get_pu_os_index() to convert the logical index of a PU
+    to its OS index.
+  * New function starpu_get_hwloc_topology() to get the hwloc topology used by
+    StarPU.
+  * Add a task prefetch level, to improve data retention in accelerators and
+    allow more aggressive prefetching.
 
 Small changes:
   * Use the S4U interface of Simgrid instead of xbt and MSG.
+  * Add a synthetic energy efficiency testcase.
 
 StarPU 1.3.4 (git revision xxx)
 ==============================================

+ 15 - 1
configure.ac

@@ -1459,6 +1459,9 @@ if test x$enable_cuda = xyes; then
 	    ]
 	)
 	if test x$have_valid_nvml = xyes ; then
+		AC_CHECK_DECLS([nvmlDeviceGetTotalEnergyConsumption], [
+			AC_CHECK_FUNCS([nvmlDeviceGetTotalEnergyConsumption])
+			], [], [[#include <nvml.h>]])
 		AC_DEFINE([HAVE_LIBNVIDIA_ML], [1], [Define to 1 if you have the nvidia-ml library])
 		STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lnvidia-ml"
 	fi
@@ -2321,6 +2324,14 @@ AC_MSG_RESULT($nmaxbuffers)
 AC_DEFINE_UNQUOTED(STARPU_NMAXBUFS, [$nmaxbuffers],
 		[how many buffers can be manipulated per task])
 
+AC_MSG_CHECKING(how many MPI nodes fxt files can be manipulated when generating traces)
+AC_ARG_ENABLE(fxt-max-files, [AS_HELP_STRING([--enable-fxt-max-files=<nbuffers>],
+			[maximum number of mpi nodes for traces])],
+			nmaxfxtfiles=$enableval, nmaxfxtfiles=64)
+AC_MSG_RESULT($nmaxfxtfiles)
+AC_DEFINE_UNQUOTED(STARPU_FXT_MAX_FILES, [$nmaxfxtfiles],
+		[how many MPI nodes fxt files can be manipulated when generating traces])
+
 AC_MSG_CHECKING(maximum number of memory nodes to use per MPI rank)
 AC_ARG_ENABLE(maxnodes, [AS_HELP_STRING([--enable-maxnodes=<nnodes>],
 			[maximum number of memory nodes per MPI rank])],
@@ -2645,7 +2656,7 @@ if test "x$enable_build_fortran_requested" = "xyes" ; then
 	fi
 	if test "x$enable_build_fortran" = "xyes" ; then
 		AC_DEFINE(STARPU_HAVE_FC, [1], [Define this if a Fortran compiler is available])
-		if test x$build_mpi_lib = xyes -o x$build_mpi_master_slave = xyes ; then
+		if test x$build_mpi_lib = xyes -o x$build_nmad_lib = xyes -o x$build_mpi_master_slave = xyes ; then
 			#Check MPIFORT
 			if test x$enable_simgrid = xyes ; then
 				DEFAULT_MPIFORT=smpifort
@@ -3620,6 +3631,9 @@ AC_CONFIG_COMMANDS([executable-scripts], [
   test -e tests/microbenchs/parallel_independent_homogeneous_tasks.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_independent_homogeneous_tasks.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_homogeneous_tasks_data.sh tests/microbenchs/
   test -e tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh || ln -sf $ac_abs_top_srcdir/tests/microbenchs/parallel_redux_heterogeneous_tasks_data.sh tests/microbenchs/
+  mkdir -p tests/energy
+  test -e tests/energy/static.sh || ln -sf $ac_abs_top_srcdir/tests/energy/static.sh tests/energy/
+  test -e tests/energy/dynamic.sh || ln -sf $ac_abs_top_srcdir/tests/energy/dynamic.sh tests/energy/
   mkdir -p tests/datawizard
   test -e tests/datawizard/locality.sh || ln -sf $ac_abs_top_srcdir/tests/datawizard/locality.sh tests/datawizard/
   mkdir -p tests/overlap

+ 1 - 1
doc/doxygen/Makefile.am

@@ -307,5 +307,5 @@ EXTRA_DIST += doxygen.cfg refman.tex \
 # Rule to update documentation on web server. Should only be used locally.
 PUBLISHHOST	?= gforge
 update-web: $(DOX_PDF)
-	scp -pr starpu.pdf html $(PUBLISHHOST):/home/groups/starpu/htdocs/doc
+	scp -pr starpu.pdf html $(PUBLISHHOST):/home/groups/starpu/htdocs/files/doc
 

+ 2 - 0
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -91,6 +91,8 @@ operations to avoid this issue. For instance:
 
 \code{.c}
 func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
+cudaError_t status = cudaGetLastError();
+if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 cudaStreamSynchronize(starpu_cuda_get_local_stream());
 \endcode
 

+ 1 - 1
doc/doxygen/chapters/301_tasks.doxy

@@ -118,7 +118,7 @@ to delay the termination of a task until the termination of other tasks.
 
 \section SettingManyDataHandlesForATask Setting Many Data Handles For a Task
 
-The maximum number of data a task can manage is fixed by the environment variable
+The maximum number of data a task can manage is fixed by the macro
 \ref STARPU_NMAXBUFS which has a default value which can be changed
 through the \c configure option \ref enable-maxbuffers "--enable-maxbuffers".
 

+ 3 - 1
doc/doxygen/chapters/320_scheduling.doxy

@@ -205,7 +205,9 @@ simply tend to run all computations on the most energy-conservative processing
 unit. To account for the consumption of the whole machine (including idle
 processing units), the idle power of the machine should be given by setting
 <c>export STARPU_IDLE_POWER=200</c> (\ref STARPU_IDLE_POWER) for 200W, for instance. This value can often
-be obtained from the machine power supplier.
+be obtained from the machine power supplier, e.g. by running
+
+<c>ipmitool -I lanplus -H mymachine-ipmi -U myuser -P mypasswd sdr type Current</c>
 
 The energy actually consumed by the total execution can be displayed by setting
 <c>export STARPU_PROFILING=1 STARPU_WORKER_STATS=1</c> (\ref STARPU_PROFILING and \ref STARPU_WORKER_STATS).

+ 3 - 0
doc/doxygen/chapters/350_scheduling_policy_definition.doxy

@@ -60,6 +60,9 @@ queue the transfers on the idle prefetch queue, which is only processed when
 there are no non-idle prefetch to process.
 starpu_get_prefetch_flag() is a convenient helper for checking the value of the 
 \ref STARPU_PREFETCH environment variable.
+When a scheduler does such prefetching, it should set the <c>prefetches</c>
+field of its <c>starpu_sched_policy</c> structure to 1, to prevent the core
+from triggering its own prefetching.
 
 Usual functions can be used on tasks, for instance one can use the following to
 get the data size for a task.

+ 9 - 0
doc/doxygen/chapters/510_configure_options.doxy

@@ -527,6 +527,15 @@ Define the maximum number of buffers that tasks will be able to take
 as parameters, then available as the macro ::STARPU_NMAXBUFS.
 </dd>
 
+<dt>--enable-fxt-max-files=<c>count</c></dt>
+<dd>
+\anchor enable-fxt-max-files
+\addindex __configure__--enable-fxt-max-files
+Use at most <c>count</c> FxT trace files (one per MPI node) when generating traces.  This value is then available as
+the macro ::STARPU_FXT_MAX_FILES and is used by the FxT tools when processing multi-node traces.
+The default value is 64.
+</dd>
+
 <dt>--enable-allocation-cache</dt>
 <dd>
 \anchor enable-allocation-cache

+ 2 - 0
doc/doxygen/chapters/code/vector_scal_cuda.c

@@ -35,6 +35,8 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(n, val, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
         cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 1 - 1
doc/doxygen_dev/Makefile.am

@@ -248,5 +248,5 @@ EXTRA_DIST += doxygen.cfg refman.tex \
 # Rule to update documentation on web server. Should only be used locally.
 PUBLISHHOST	?= gforge
 update-web: $(DOX_PDF)
-	scp -pr starpu_dev.pdf html_dev $(PUBLISHHOST):/home/groups/starpu/htdocs/doc
+	scp -pr starpu_dev.pdf html_dev $(PUBLISHHOST):/home/groups/starpu/htdocs/files/doc
 

+ 2 - 0
doc/tutorial/vector_scal_cuda.cu

@@ -35,6 +35,8 @@ extern "C" void vector_scal_cuda(void *buffers[], void *_args)
         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
         cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
examples/basic_examples/block_cuda.cu

@@ -40,5 +40,7 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
         float *multiplier = (float *)_args;
 
         cuda_block<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *multiplier);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
examples/basic_examples/multiformat_conversion_codelets_cuda.cu

@@ -44,4 +44,6 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         cpu_to_cuda_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(src, dst, n);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/basic_examples/multiformat_cuda.cu

@@ -39,6 +39,8 @@ extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         multiformat_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(soa, n);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
examples/basic_examples/variable_kernels.cu

@@ -27,5 +27,7 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 	float *val = (float *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	cuda_variable<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
examples/basic_examples/vector_scal_cuda.cu

@@ -41,4 +41,6 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(n, val, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/filters/custom_mf/conversion.cu

@@ -45,4 +45,6 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         custom_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(aop, n, x, y);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/filters/custom_mf/cuda.cu

@@ -39,4 +39,6 @@ extern "C" void custom_scal_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         scal_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(n, x, y);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/filters/fblock_cuda.cu

@@ -43,4 +43,6 @@ extern "C" void cuda_func(void *buffers[], void *_args)
 
         /* TODO: use more blocks and threads in blocks */
         fblock_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 4 - 0
examples/filters/fmultiple_cuda.cu

@@ -44,6 +44,8 @@ extern "C" void fmultiple_check_scale_cuda(void *buffers[], void *cl_arg)
 
         /* TODO: use more vals and threads in vals */
 	_fmultiple_check_scale_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val, nx, ny, ld, start, factor);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 
 static __global__ void _fmultiple_check_cuda(int *val, int nx, int ny, unsigned ld, int start, int factor)
@@ -71,4 +73,6 @@ extern "C" void fmultiple_check_cuda(void *buffers[], void *cl_arg)
 
         /* TODO: use more vals and threads in vals */
 	_fmultiple_check_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val, nx, ny, ld, start, factor);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/incrementer/incrementer_kernels.cu

@@ -32,4 +32,6 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/interface/complex_kernels.cu

@@ -44,4 +44,6 @@ extern "C" void copy_complex_codelet_cuda(void *descr[], void *_args)
 	unsigned nblocks = (nx + threads_per_block-1) / threads_per_block;
 
         complex_copy_cuda<<<nblocks, threads_per_block, 0, starpu_cuda_get_local_stream()>>>(o_real, o_imaginary, i_real, i_imaginary, nx);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 1 - 0
examples/mult/double.h

@@ -15,6 +15,7 @@
  */
 
 #define TYPE	double
+#define EPSILON	0.000000000001
 
 #define CUBLAS_GEMM cublasDgemm
 #define CPU_GEMM	STARPU_DGEMM

+ 1 - 0
examples/mult/simple.h

@@ -15,6 +15,7 @@
  */
 
 #define TYPE	float
+#define EPSILON	0.000001
 
 #define CUBLAS_GEMM cublasSgemm
 #define CPU_GEMM	STARPU_SGEMM

+ 1 - 1
examples/mult/xgemm.c

@@ -75,7 +75,7 @@ static int check_output(void)
 	TYPE err;
 	err = CPU_ASUM(xdim*ydim, C, 1);
 
-	if (err < xdim*ydim*0.001)
+	if (err < EPSILON*xdim*ydim*zdim)
 	{
 		FPRINTF(stderr, "Results are OK\n");
 		return 0;

+ 2 - 0
examples/pi/SobolQRNG/sobol_gpu.cu

@@ -165,4 +165,6 @@ extern "C" void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_direct
 
     // Execute GPU kernel
     sobolGPU_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(n_vectors, n_dimensions, d_directions, d_output);
+    cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 4 - 0
examples/pi/pi_kernel.cu

@@ -137,12 +137,16 @@ extern "C" void cuda_kernel(void *descr[], void *cl_arg)
 	/* each entry of per_block_cnt contains the number of successful shots
 	 * in the corresponding block. */
 	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 
 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
 
 	/* compute the total number of successful shots by adding the elements
 	 * of the per_block_cnt array */
 	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	if (cures)
 		STARPU_CUDA_REPORT_ERROR(cures);

+ 4 - 0
examples/pi/pi_redux_kernel.cu

@@ -115,12 +115,16 @@ extern "C" void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned lo
 	/* each entry of per_block_cnt contains the number of successful shots
 	 * in the corresponding block. */
 	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, y, n, per_block_cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 
 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
 
 	/* compute the total number of successful shots by adding the elements
 	 * of the per_block_cnt array */
 	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, shot_cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	if (cures)
 		STARPU_CUDA_REPORT_ERROR(cures);

+ 2 - 0
examples/reductions/dot_product_kernels.cu

@@ -33,4 +33,6 @@ extern "C" void redux_cuda_func(void *descr[], void *_args)
 	DOT_TYPE *dotb = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
 	cuda_redux<<<1,1, 0, starpu_cuda_get_local_stream()>>>(dota, dotb);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/sched_ctx/axpy_partition_gpu.cu

@@ -73,4 +73,6 @@ extern "C" void cuda_axpy(void *descr[], void *_args)
 	__P_HOSTSETUP(saxpy_partitioned,dim3(dimensions,1,1),dimensions,0,SM_mapping_start,SM_allocation,stream);
 
   	saxpy_partitioned<<<width,dimensions,0,stream>>>(__P_HKARGS,n,a,x,y);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/spmv/spmv_cuda.cu

@@ -97,6 +97,8 @@ extern "C" void spmv_kernel_cuda(void *descr[], void *args)
 
 	spmv_kernel_3<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>
 		(nnz, nrow, nzval, colind, rowptr, firstentry, vecin, nx_in, vecout, nx_out);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 
 

+ 2 - 0
examples/stencil/life_cuda.cu

@@ -73,4 +73,6 @@ extern "C" void cuda_life_update_host(int bz, const TYPE *old, TYPE *newp, int n
 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
 #endif
 	cuda_life_update <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, old, newp, nx, ny, nz, ldy, ldz, iter);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/stencil/shadow.cu

@@ -53,4 +53,6 @@ extern "C" void cuda_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int
 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
 #endif
 	cuda_shadow <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, ptr, nx, ny, nz, ldy, ldz, i);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 9 - 0
include/starpu_config.h.in

@@ -187,6 +187,15 @@
 #undef STARPU_NMAXBUFS
 
 /**
+   Define the maximum number of FxT files (one per MPI node) that can be
+   read when generating traces. The default value is 64; it can be changed
+   with the configure option \ref enable-fxt-max-files
+   "--enable-fxt-max-files".
+   @ingroup API_MPI_Support
+*/
+#undef STARPU_FXT_MAX_FILES
+
+/**
    Define the maximum number of CPU workers managed by StarPU. The
    default value can be modified at configure by using the option \ref
    enable-maxcpus "--enable-maxcpus".

+ 1 - 2
include/starpu_fxt.h

@@ -20,6 +20,7 @@
 #ifndef __STARPU_FXT_H__
 #define __STARPU_FXT_H__
 
+#include <starpu_config.h>
 #include <starpu_perfmodel.h>
 
 #ifdef __cplusplus
@@ -32,8 +33,6 @@ extern "C"
    @{
 */
 
-#define STARPU_FXT_MAX_FILES	64
-
 struct starpu_fxt_codelet_event
 {
 	char symbol[256];

+ 17 - 0
include/starpu_helper.h

@@ -20,6 +20,10 @@
 #include <stdio.h>
 #include <starpu.h>
 
+#ifdef STARPU_HAVE_HWLOC
+#include <hwloc.h>
+#endif
+
 #ifdef __cplusplus
 extern "C"
 {
@@ -190,6 +194,19 @@ int starpu_data_cpy(starpu_data_handle_t dst_handle, starpu_data_handle_t src_ha
 */
 void starpu_display_bindings(void);
 
+/**
+   If \c hwloc is used, convert the given \p logical_index of a PU to the OS
+   index of this PU. If \c hwloc is not used, return \p logical_index.
+*/
+int starpu_get_pu_os_index(unsigned logical_index);
+
+#ifdef STARPU_HAVE_HWLOC
+/**
+   Get the hwloc topology used by StarPU. One can use this pointer to get
+   information about topology, but not to change settings related to topology.
+*/
+hwloc_topology_t starpu_get_hwloc_topology(void);
+#endif
 /** @} */
 
 #ifdef __cplusplus
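
A small illustration of the two new helpers declared above (the program below is a sketch, not part of the change; the hwloc calls are plain hwloc API, and starpu_init()/starpu_shutdown() are the usual entry points):

#include <stdio.h>
#include <starpu.h>
#ifdef STARPU_HAVE_HWLOC
#include <hwloc.h>
#endif

int main(void)
{
	if (starpu_init(NULL) != 0)
		return 1;

#ifdef STARPU_HAVE_HWLOC
	/* Read-only access to the topology StarPU already loaded */
	hwloc_topology_t topo = starpu_get_hwloc_topology();
	int i, npus = hwloc_get_nbobjs_by_type(topo, HWLOC_OBJ_PU);

	for (i = 0; i < npus; i++)
		printf("logical PU %d -> OS index %d\n", i, starpu_get_pu_os_index(i));
#else
	/* Without hwloc, the logical index is returned unchanged */
	printf("logical PU 0 -> OS index %d\n", starpu_get_pu_os_index(0));
#endif

	starpu_shutdown();
	return 0;
}
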

+ 5 - 0
include/starpu_scheduler.h

@@ -186,6 +186,11 @@ struct starpu_sched_policy
 	*/
 	void (*remove_workers)(unsigned sched_ctx_id, int *workerids, unsigned nworkers);
 
+	/** Whether this scheduling policy does data prefetching, and thus the
+	    core should not try to do it opportunistically.
+	*/
+	int prefetches;
+
 	/**
 	   Optional field. Name of the policy.
 	*/
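
To illustrate the new prefetches field, here is a minimal sketch of a policy that prefetches by itself and therefore sets it to 1. The policy name, the push hook and the worker choice are made up; starpu_worker_get_memory_node() and starpu_push_local_task() are assumed to be the usual StarPU helpers, while starpu_get_prefetch_flag() and starpu_prefetch_task_input_on_node() appear elsewhere in this change:

#include <starpu.h>
#include <starpu_scheduler.h>

static int my_sched_push_task(struct starpu_task *task)
{
	int workerid = 0; /* toy decision: always pick worker 0 */
	unsigned node = starpu_worker_get_memory_node(workerid);

	/* The policy issues the prefetch itself... */
	if (starpu_get_prefetch_flag())
		starpu_prefetch_task_input_on_node(task, node);

	/* ...then queues the task on the chosen worker */
	return starpu_push_local_task(workerid, task, 0);
}

struct starpu_sched_policy my_sched_policy =
{
	.push_task = my_sched_push_task,
	.prefetches = 1, /* tell the core this policy prefetches on its own */
	.policy_name = "my_sched",
	.policy_description = "sketch of a policy doing its own prefetching",
};
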

+ 23 - 0
include/starpu_stdlib.h

@@ -239,9 +239,32 @@ void starpu_memory_deallocate(unsigned node, size_t size);
 */
 void starpu_memory_wait_available(unsigned node, size_t size);
 
+/**
+   Sleep for the given \p nb_sec seconds.
+   In simgrid mode, this only sleeps within virtual time.
+  */
 void starpu_sleep(float nb_sec);
+
+/**
+   Sleep for the given \p nb_micro_sec micro-seconds.
+   In simgrid mode, this only sleeps within virtual time.
+  */
 void starpu_usleep(float nb_micro_sec);
 
+/**
+   Account for \p joules J being used.
+   This is supported in simgrid mode, to record how much energy was used; it
+   will show up in further calls to starpu_energy_used().
+  */
+void starpu_energy_use(float joules);
+
+/**
+   Return the amount of energy that has been used, in J.
+   This accounts for the amounts passed to starpu_energy_use(), but also the
+   static energy use set by the \ref STARPU_IDLE_POWER environment variable.
+  */
+double starpu_energy_used(void);
+
 /** @} */
 
 #ifdef __cplusplus
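
A quick sketch of how the two new energy calls above can be used (toy program; it assumes a simgrid build, which is where the accounting is implemented in this change):

#include <stdio.h>
#include <starpu.h>

int main(void)
{
	if (starpu_init(NULL) != 0)
		return 1;

	starpu_energy_use(2.5f);   /* record 2.5 J of dynamic energy */
	starpu_sleep(1.0f);        /* one (virtual) second of elapsed time */

	/* total = dynamic energy + STARPU_IDLE_POWER * elapsed time */
	printf("energy used: %g J\n", starpu_energy_used());

	starpu_shutdown();
	return 0;
}
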

+ 1 - 1
include/starpu_task.h

@@ -563,7 +563,7 @@ struct starpu_codelet
 
 	/**
 	   Optional pointer to the task energy consumption performance
-	   model associated to this codelet. This optional field is
+	   model associated to this codelet (in J). This optional field is
 	   ignored when set to <c>NULL</c> or when its field
 	   starpu_perfmodel::symbol is not set. In the case of
 	   parallel codelets, this has to account for all processing
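
For reference, this is how a codelet can attach such an energy model next to its timing model. The codelet below is hypothetical, and the sketch assumes the field being documented here is the codelet's energy_model member:

#include <starpu.h>

extern void my_cpu_func(void *buffers[], void *cl_arg); /* assumed defined elsewhere */

static struct starpu_perfmodel my_time_model =
{
	.type = STARPU_HISTORY_BASED,
	.symbol = "my_codelet_time",
};

static struct starpu_perfmodel my_energy_model =
{
	.type = STARPU_HISTORY_BASED,
	.symbol = "my_codelet_energy", /* expected consumption, in J */
};

static struct starpu_codelet my_cl =
{
	.cpu_funcs = { my_cpu_func },
	.nbuffers = 1,
	.modes = { STARPU_RW },
	.model = &my_time_model,
	.energy_model = &my_energy_model,
};
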

+ 2 - 0
julia/examples/mult/gpu_mult.cu

@@ -79,6 +79,8 @@ extern "C" void gpu_mult(void * descr[], void * args)
 	gpuMultKernel
 		<<< nblocks, THREADS_PER_BLOCK, 0, NULL /*starpu_cuda_get_local_stream()*/
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 

+ 2 - 0
julia/examples/old_examples/gpu_mult.cu

@@ -78,6 +78,8 @@ extern "C" void gpu_mult(void * descr[], void * args)
 	gpuMultKernel
 		<<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 

+ 2 - 0
julia/examples/old_examples/mandelbrot/gpu_mandelbrot.cu

@@ -106,6 +106,8 @@ extern "C" void gpu_mandelbrot(void *descr[], void *args)
   nblocks = (nxP * nyP + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
 
   gpuMandelbrotKernel <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream() >>> (nxP, nyP, ldP, d_subP, *params);
+  cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
julia/examples/old_examples/mandelbrot/gpu_mandelbrot_between.cu

@@ -123,6 +123,8 @@ extern "C" void CUDA_mandelbrot(void** buffers_uwrYFDVe, void* cl_arg_uwrYFDVe)
              ptr_qoUGBRtY, local_height, conv_limit, ptr_A5zD9sJZ, 
              ld_A5zD9sJZ);
     ;
+    cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
     cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 

+ 2 - 0
julia/examples/old_examples/mult/gpu_mult.cu

@@ -78,6 +78,8 @@ extern "C" void gpu_mult(void * descr[], void * args)
 	gpuMultKernel
 		<<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 

+ 4 - 0
julia/examples/old_examples/nbody/gpu_nbody.cu

@@ -94,6 +94,8 @@ extern "C" void gpu_nbody(void * descr[], void * args)
   gpuNbodyKernel
     <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
     >>> (d_P,  d_subA, d_M, nxP, nxA, nxM, ldP, ldA, *params);
+  cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
@@ -156,6 +158,8 @@ extern "C" void gpu_nbody2(void * descr[], void *args)
   gpuNbody2Kernel
     <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
     >>> (d_subP, d_subV, d_subA, nxP, nxV, nxA, ldP, ldV, ldA, *params);
+  cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
julia/examples/old_examples/nbody/gpu_nbody_between.cu

@@ -161,6 +161,8 @@ extern "C" void CUDA_nbody_updt(void** buffers_gj6UYWT4, void* cl_arg_gj6UYWT4)
              ld_jJ5f8wMA, ptr_piPvdbTs, ld_piPvdbTs, ptr_JBaPgPiT, 
              ptr_0STm2S4k, ld_0STm2S4k);
     ;
+    cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
     cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 

+ 4 - 0
julia/src/compiler/expressions.jl

@@ -335,6 +335,10 @@ function print(io :: IO, expr :: StarpuExprCudaCall ; indent = 0,restrict=false)
 
     print(io, ");")
     print_newline(io, indent)
+    print(io, "cudaError_t status = cudaGetLastError();")
+    print_newline(io, indent)
+    print(io, "if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);")
+    print_newline(io, indent)
 
 end
 

+ 1 - 1
mpi/examples/matrix_decomposition/mpi_cholesky.c

@@ -58,7 +58,7 @@ int main(int argc, char **argv)
 #ifndef STARPU_SIMGRID
 	matrix_display(bmat, rank);
 
-	if (check)
+	if (check && rank == 0)
 		dw_cholesky_check_computation(bmat, rank, nodes, &correctness, &flops, 0.001);
 #endif
 

+ 223 - 57
mpi/examples/matrix_decomposition/mpi_cholesky_codelets.c

@@ -68,6 +68,212 @@ static struct starpu_codelet cl22 =
 	.color = 0x00ff00,
 };
 
+static void run_cholesky(starpu_data_handle_t **data_handles, int rank, int nodes)
+{
+	unsigned k, m, n;
+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+
+	for (k = 0; k < nblocks; k++)
+	{
+		starpu_iteration_push(k);
+
+		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
+				       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
+				       STARPU_RW, data_handles[k][k],
+				       0);
+
+		for (m = k+1; m<nblocks; m++)
+		{
+			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+					       STARPU_R, data_handles[k][k],
+					       STARPU_RW, data_handles[m][k],
+					       0);
+
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
+			if (my_distrib(k, k, nodes) == rank)
+				starpu_data_wont_use(data_handles[k][k]);
+
+			for (n = k+1; n<nblocks; n++)
+			{
+				if (n <= m)
+				{
+					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+							       STARPU_R, data_handles[n][k],
+							       STARPU_R, data_handles[m][k],
+							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+							       0);
+				}
+			}
+
+			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
+			if (my_distrib(m, k, nodes) == rank)
+				starpu_data_wont_use(data_handles[m][k]);
+		}
+		starpu_iteration_pop();
+	}
+}
+
+/* TODO: generated from compiler polyhedral analysis of classical algorithm */
+static void run_cholesky_column(starpu_data_handle_t **data_handles, int rank, int nodes)
+{
+	unsigned k, m, n;
+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+
+	/* Column */
+	for (n = 0; n<nblocks; n++)
+	{
+		starpu_iteration_push(n);
+
+		/* Row */
+		for (m = n; m<nblocks; m++)
+		{
+			for (k = 0; k < n; k++)
+			{
+				/* Accumulate updates from TRSMs */
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[n][k],
+						       STARPU_R, data_handles[m][k],
+						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       0);
+			}
+			k = n;
+			if (m > n)
+			{
+				/* non-diagonal block, solve */
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[k][k],
+						       STARPU_RW, data_handles[m][k],
+						       0);
+			}
+			else
+			{
+				/* diagonal block, factorize */
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
+						       STARPU_RW, data_handles[k][k],
+						       0);
+			}
+		}
+
+		starpu_iteration_pop();
+	}
+
+	/* Submit flushes, StarPU will fit them according to the progress */
+	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
+	for (m = 0; m < nblocks; m++)
+		for (n = 0; n < nblocks ; n++)
+			starpu_data_wont_use(data_handles[m][n]);
+}
+
+/* TODO: generated from compiler polyhedral analysis of classical algorithm */
+static void run_cholesky_antidiagonal(starpu_data_handle_t **data_handles, int rank, int nodes)
+{
+	unsigned a, b, c;
+	unsigned k, m, n;
+	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
+
+	/* double-antidiagonal number:
+	 * - a=0 contains (0,0) plus (1,0)
+	 * - a=1 contains (2,0), (1,1) plus (3,0), (2, 1)
+	 * - etc.
+	 */
+	for (a = 0; a < nblocks; a++)
+	{
+		starpu_iteration_push(a);
+
+		unsigned bfirst;
+		if (2*a < nblocks)
+			bfirst = 0;
+		else
+			bfirst = 2*a - (nblocks-1);
+
+		/* column within first antidiagonal for a */
+		for (b = bfirst; b <= a; b++)
+		{
+			/* column */
+			n = b;
+			/* row */
+			m = 2*a-b;
+
+			/* Accumulate updates from TRSMs */
+			for (c = 0; c < n; c++)
+			{
+				k = c;
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[n][k],
+						       STARPU_R, data_handles[m][k],
+						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       0);
+			}
+
+			if (b < a)
+			{
+				/* non-diagonal block, solve */
+				k = n;
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[k][k],
+						       STARPU_RW, data_handles[m][k],
+						       0);
+			}
+			else
+			{
+				/* diagonal block, factorize */
+				k = a;
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
+						       STARPU_RW, data_handles[k][k],
+						       0);
+			}
+		}
+
+		/* column within second antidiagonal for a */
+		for (b = bfirst; b <= a; b++)
+		{
+			/* column */
+			n = b;
+			/* row */
+			m = 2*a-b + 1;
+
+			if (m >= nblocks)
+				/* Skip first item when even number of tiles */
+				continue;
+
+			/* Accumulate updates from TRSMs */
+			for (c = 0; c < n; c++)
+			{
+				k = c;
+				starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
+						       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+						       STARPU_R, data_handles[n][k],
+						       STARPU_R, data_handles[m][k],
+						       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
+						       0);
+			}
+			/* non-diagonal block, solve */
+			k = n;
+			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
+					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
+					       STARPU_R, data_handles[k][k],
+					       STARPU_RW, data_handles[m][k],
+					       0);
+		}
+
+		starpu_iteration_pop();
+	}
+
+	/* Submit flushes, StarPU will fit them according to the progress */
+	starpu_mpi_cache_flush_all_data(MPI_COMM_WORLD);
+	for (m = 0; m < nblocks; m++)
+		for (n = 0; n < nblocks ; n++)
+			starpu_data_wont_use(data_handles[m][n]);
+}
+
 /*
  *	code to bootstrap the factorization
  *	and construct the DAG
@@ -79,8 +285,6 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	starpu_data_handle_t **data_handles;
 	unsigned k, m, n;
 
-	unsigned unbound_prio = STARPU_MAX_PRIO == INT_MAX && STARPU_MIN_PRIO == INT_MIN;
-
 	/* create all the DAG nodes */
 
 	data_handles = malloc(nblocks*sizeof(starpu_data_handle_t *));
@@ -91,7 +295,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 		for(n = 0; n < nblocks ; n++)
 		{
 			int mpi_rank = my_distrib(m, n, nodes);
-			if (mpi_rank == rank)
+			if (mpi_rank == rank || (check && rank == 0))
 			{
 				//fprintf(stderr, "[%d] Owning data[%d][%d]\n", rank, n, m);
 				starpu_matrix_data_register(&data_handles[m][n], STARPU_MAIN_RAM, (uintptr_t)matA[m][n],
@@ -119,50 +323,16 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 	starpu_mpi_barrier(MPI_COMM_WORLD);
 	start = starpu_timing_now();
 
-	for (k = 0; k < nblocks; k++)
+	switch (submission)
 	{
-		starpu_iteration_push(k);
-
-		starpu_mpi_task_insert(MPI_COMM_WORLD, &cl11,
-				       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k) : STARPU_MAX_PRIO,
-				       STARPU_RW, data_handles[k][k],
-				       0);
-
-		for (m = k+1; m<nblocks; m++)
-		{
-			starpu_mpi_task_insert(MPI_COMM_WORLD, &cl21,
-					       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m) : (m == k+1)?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
-					       STARPU_R, data_handles[k][k],
-					       STARPU_RW, data_handles[m][k],
-					       0);
-
-			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[k][k]);
-			if (my_distrib(k, k, nodes) == rank)
-				starpu_data_wont_use(data_handles[k][k]);
-
-			for (n = k+1; n<nblocks; n++)
-			{
-				if (n <= m)
-				{
-					starpu_mpi_task_insert(MPI_COMM_WORLD, &cl22,
-							       STARPU_PRIORITY, noprio ? STARPU_DEFAULT_PRIO : unbound_prio ? (int)(2*nblocks - 2*k - m - n) : ((n == k+1) && (m == k+1))?STARPU_MAX_PRIO:STARPU_DEFAULT_PRIO,
-							       STARPU_R, data_handles[n][k],
-							       STARPU_R, data_handles[m][k],
-							       STARPU_RW | STARPU_COMMUTE, data_handles[m][n],
-							       0);
-				}
-			}
-
-			starpu_mpi_cache_flush(MPI_COMM_WORLD, data_handles[m][k]);
-			if (my_distrib(m, k, nodes) == rank)
-				starpu_data_wont_use(data_handles[m][k]);
-		}
-		starpu_iteration_pop();
+		case TRIANGLES:		run_cholesky(data_handles, rank, nodes); break;
+		case COLUMNS:		run_cholesky_column(data_handles, rank, nodes); break;
+		case ANTIDIAGONALS:	run_cholesky_antidiagonal(data_handles, rank, nodes); break;
+		default: STARPU_ABORT();
 	}
 
 	starpu_mpi_wait_for_all(MPI_COMM_WORLD);
 	starpu_mpi_barrier(MPI_COMM_WORLD);
-
 	end = starpu_timing_now();
 
 	for (m = 0; m < nblocks; m++)
@@ -170,7 +340,7 @@ void dw_cholesky(float ***matA, unsigned ld, int rank, int nodes, double *timing
 		for(n = 0; n < nblocks ; n++)
 		{
 			/* Get back data on node 0 for the check */
-			if (check)
+			if (check && data_handles[m][n])
 				starpu_mpi_get_data_on_node(MPI_COMM_WORLD, data_handles[m][n], 0);
 
 			if (data_handles[m][n])
@@ -248,24 +418,20 @@ void dw_cholesky_check_computation(float ***matA, int rank, int nodes, int *corr
 	{
 		for (m = 0; m < nblocks; m++)
 		{
-			int mpi_rank = my_distrib(m, n, nodes);
-			if (mpi_rank == rank)
+			for (nn = BLOCKSIZE*n ; nn < BLOCKSIZE*(n+1); nn++)
 			{
-				for (nn = (size/nblocks)*n ; nn < (size/nblocks)*n+(size/nblocks); nn++)
+				for (mm = BLOCKSIZE*m ; mm < BLOCKSIZE*(m+1); mm++)
 				{
-					for (mm = (size/nblocks)*m ; mm < (size/nblocks)*m+(size/nblocks); mm++)
+					if (nn <= mm)
 					{
-						if (nn <= mm)
+						float orig = (1.0f/(1.0f+nn+mm)) + ((nn == mm)?1.0f*size:0.0f);
+						float err = fabsf(test_mat[mm +nn*size] - orig) / orig;
+						if (err > epsilon)
 						{
-							float orig = (1.0f/(1.0f+nn+mm)) + ((nn == mm)?1.0f*size:0.0f);
-							float err = fabsf(test_mat[mm +nn*size] - orig) / orig;
-							if (err > epsilon)
-							{
-								FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.20f != %2.20f (err %2.20f)\n", rank, nn, mm, test_mat[mm +nn*size], orig, err);
-								*correctness = 0;
-								*flops = 0;
-								break;
-							}
+							FPRINTF(stderr, "[%d] Error[%u, %u] --> %2.20f != %2.20f (err %2.20f)\n", rank, nn, mm, test_mat[mm +nn*size], orig, err);
+							*correctness = 0;
+							*flops = 0;
+							break;
 						}
 					}
 				}

+ 12 - 1
mpi/examples/matrix_decomposition/mpi_decomposition_params.c

@@ -43,6 +43,7 @@ unsigned check = 0;
 unsigned display = 0;
 int dblockx = -1;
 int dblocky = -1;
+enum submission submission = TRIANGLES;
 
 void parse_args(int argc, char **argv, int nodes)
 {
@@ -79,6 +80,16 @@ void parse_args(int argc, char **argv, int nodes)
                         nbigblocks = strtol(argv[++i], &argptr, 10);
                 }
 
+                if (strcmp(argv[i], "-columns") == 0)
+                {
+                        submission = COLUMNS;
+                }
+
+                if (strcmp(argv[i], "-antidiagonals") == 0)
+                {
+                        submission = ANTIDIAGONALS;
+                }
+
                 if (strcmp(argv[i], "-no-prio") == 0)
                 {
                         noprio = 1;
@@ -96,7 +107,7 @@ void parse_args(int argc, char **argv, int nodes)
 
                 if (strcmp(argv[i], "-h") == 0 || strcmp(argv[i], "--help") == 0)
                 {
-			printf("usage : %s [-size size] [-nblocks nblocks] [-no-prio] [-display] [-check]\n", argv[0]);
+                        printf("usage : %s [-size size] [-nblocks nblocks] [-columns] [-antidiagonals] [-no-prio] [-display] [-check]\n", argv[0]);
                 }
         }
 

+ 8 - 0
mpi/examples/matrix_decomposition/mpi_decomposition_params.h

@@ -28,6 +28,14 @@ extern unsigned display;
 extern int dblockx;
 extern int dblocky;
 
+enum submission
+{
+	TRIANGLES,
+	COLUMNS,
+	ANTIDIAGONALS,
+};
+extern enum submission submission;
+
 void parse_args(int argc, char **argv, int nodes);
 
 #endif // __MPI_CHOLESKY_PARAMS_H__

+ 6 - 0
mpi/include/starpu_mpi.h

@@ -132,6 +132,12 @@ int starpu_mpi_world_size(void);
 */
 int starpu_mpi_comm_get_attr(MPI_Comm comm, int keyval, void *attribute_val, int *flag);
 
+
+/**
+   Get the logical index of the core where the MPI thread is bound.
+*/
+int starpu_mpi_get_thread_cpuid(void);
+
 int starpu_mpi_get_communication_tag(void);
 void starpu_mpi_set_communication_tag(int tag);
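
A possible use of the new call, once StarPU-MPI is initialized, is to report where the MPI progression thread ended up; together with starpu_get_pu_os_index() from this same change, the logical index can be turned into an OS index. This is a sketch, and treating a negative return value as "not bound" is an assumption:

#include <stdio.h>
#include <starpu.h>
#include <starpu_mpi.h>

static void report_mpi_thread_binding(int rank)
{
	int cpu = starpu_mpi_get_thread_cpuid();

	if (cpu < 0) /* assumption: negative means the thread is not bound */
		printf("[%d] MPI thread not bound\n", rank);
	else
		printf("[%d] MPI thread bound to logical PU %d (OS index %d)\n",
		       rank, cpu, starpu_get_pu_os_index(cpu));
}
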
 

+ 2 - 3
mpi/src/starpu_mpi_datatype.c

@@ -26,17 +26,16 @@ struct _starpu_mpi_datatype_funcs
 	UT_hash_handle hh;
 };
 
-static starpu_pthread_mutex_t _starpu_mpi_datatype_funcs_table_mutex;
+/* We want to allow applications calling starpu_mpi_interface_datatype_register/unregister as constructor/destructor */
+static starpu_pthread_mutex_t _starpu_mpi_datatype_funcs_table_mutex = STARPU_PTHREAD_MUTEX_INITIALIZER;
 static struct _starpu_mpi_datatype_funcs *_starpu_mpi_datatype_funcs_table = NULL;
 
 void _starpu_mpi_datatype_init(void)
 {
-	STARPU_PTHREAD_MUTEX_INIT(&_starpu_mpi_datatype_funcs_table_mutex, NULL);
 }
 
 void _starpu_mpi_datatype_shutdown(void)
 {
-	STARPU_PTHREAD_MUTEX_DESTROY(&_starpu_mpi_datatype_funcs_table_mutex);
 }
 
 /*

+ 5 - 0
mpi/src/starpu_mpi_init.c

@@ -336,3 +336,8 @@ int starpu_mpi_world_rank(void)
 	starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
 	return rank;
 }
+
+int starpu_mpi_get_thread_cpuid(void)
+{
+	return _starpu_mpi_thread_cpuid;
+}

+ 2 - 0
mpi/tests/ring_kernel.cu

@@ -27,5 +27,7 @@ extern "C" void increment_cuda(void *descr[], void *_args)
 	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(tokenptr);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 11 - 4
src/Makefile.am

@@ -411,9 +411,16 @@ endif
 # static inline definition
 dist-hook:
 	failed=0 ; \
-	for i in $$( $(GREP) "static inline" $$(find $(srcdir) -name \*.h) | $(SED) -e 's/.*static inline //g' | $(GREP) -v ENAME | $(SED) -e 's/[^(]* \(\|\*\)\([^ (]*\)(.*/\2/' | $(GREP) -v _starpu_spin_init) ; do \
-		for j in $(shell find . -name \*.o) ; do \
-			nm $$j | $(GREP) "U $$i$$" && { echo $$j ; failed=1 ; } ; \
-		done ; \
+	look=""; \
+	for i in $$( $(GREP) "static inline" $$(find $(srcdir) -name \*.h) | $(SED) -e 's/.*static inline //g' | $(GREP) -v ENAME\#\# | $(SED) -n -e 's/[^(]* \(\|\*\)\([^ (]*\)(.*/\2/' -e 'p;s/^_*//;p' | $(GREP) -v _starpu_spin_init | $(GREP) -v starpu_sched_ctx_worker_is_master_for_child_ctx) ; do \
+		if [ -z "$$look" ] ; then \
+			look="$$i" ; \
+		else \
+			look="$$look\|$$i" ; \
+		fi ; \
+	done ; \
+	echo "$$look" ; \
+	for j in $(shell find . -name \*.o) ; do \
+		nm $$j | $(GREP) -e "U \($$look\)$$" && { echo $$j ; failed=1 ; } ; \
 	done ; \
 	[ $$failed == 0 ]

+ 3 - 0
src/core/jobs.c

@@ -24,10 +24,12 @@
 #include <common/config.h>
 #include <common/utils.h>
 #include <common/graph.h>
+#include <datawizard/memory_nodes.h>
 #include <profiling/profiling.h>
 #include <profiling/bound.h>
 #include <core/debug.h>
 #include <limits.h>
+#include <core/workers.h>
 
 static int max_memory_use;
 static unsigned long njobs, maxnjobs;
@@ -483,6 +485,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 		 * also the callback were executed. */
 		j->terminated = 2;
 	}
+	task->prefetched = 0;
 	STARPU_PTHREAD_COND_BROADCAST(&j->sync_cond);
 	STARPU_AYU_REMOVETASK(j->job_id);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);

+ 5 - 4
src/core/perfmodel/perfmodel_bus.c

@@ -37,6 +37,7 @@
 #include <core/topology.h>
 #include <common/utils.h>
 #include <drivers/mpi/driver_mpi_common.h>
+#include <datawizard/memory_nodes.h>
 
 #ifdef STARPU_USE_OPENCL
 #include <starpu_opencl.h>
@@ -188,7 +189,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 	/* Allocate a buffer on the device */
 	unsigned char *d_buffer;
 	cures = cudaMalloc((void **)&d_buffer, size);
-	STARPU_ASSERT(cures == cudaSuccess);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
 
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
@@ -217,7 +218,7 @@ static void measure_bandwidth_between_host_and_dev_on_numa_with_cuda(int dev, in
 		cudaHostRegister((void *)h_buffer, size, 0);
 	}
 
-	STARPU_ASSERT(cures == cudaSuccess);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
 
 	/* hack to avoid third party libs to rebind threads */
 	_starpu_bind_thread_on_cpu(cpu, STARPU_NOWORKERID, NULL);
@@ -342,7 +343,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	/* Allocate a buffer on the device */
 	unsigned char *s_buffer;
 	cures = cudaMalloc((void **)&s_buffer, size);
-	STARPU_ASSERT(cures == cudaSuccess);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
 	cudaMemset(s_buffer, 0, size);
 	cudaDeviceSynchronize();
 
@@ -368,7 +369,7 @@ static void measure_bandwidth_between_dev_and_dev_cuda(int src, int dst)
 	/* Allocate a buffer on the device */
 	unsigned char *d_buffer;
 	cures = cudaMalloc((void **)&d_buffer, size);
-	STARPU_ASSERT(cures == cudaSuccess);
+	if (STARPU_UNLIKELY(cures)) STARPU_CUDA_REPORT_ERROR(cures);
 	cudaMemset(d_buffer, 0, size);
 	cudaDeviceSynchronize();
 

+ 1 - 0
src/core/perfmodel/perfmodel_print.c

@@ -19,6 +19,7 @@
 #include <starpu.h>
 #include <starpu_perfmodel.h>
 #include <common/config.h>
+#include <core/workers.h>
 #include "perfmodel.h"
 
 static

+ 1 - 0
src/core/sched_ctx.c

@@ -21,6 +21,7 @@
 #include <common/utils.h>
 #include <stdarg.h>
 #include <core/task.h>
+#include <core/workers.h>
 
 enum _starpu_ctx_change_op
 {

+ 20 - 20
src/core/sched_policy.c

@@ -22,6 +22,7 @@
 #include <common/utils.h>
 #include <core/sched_policy.h>
 #include <profiling/profiling.h>
+#include <datawizard/memory_nodes.h>
 #include <common/barrier.h>
 #include <core/debug.h>
 #include <core/task.h>
@@ -569,32 +570,12 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 	int ret = 0;
 	if (STARPU_UNLIKELY(task->execute_on_a_specific_worker))
 	{
-		if (starpu_get_prefetch_flag())
-			starpu_prefetch_task_input_for(task, task->workerid);
-
 		ret = _starpu_push_task_on_specific_worker(task, task->workerid);
 	}
 	else
 	{
 		struct _starpu_machine_config *config = _starpu_get_machine_config();
 
-		/* When a task can only be executed on a given arch and we have
-		 * only one memory node for that arch, we can systematically
-		 * prefetch before the scheduling decision. */
-		if (starpu_get_prefetch_flag() && starpu_memory_nodes_get_count() > 1)
-		{
-			if (task->where == STARPU_CPU && config->cpus_nodeid >= 0)
-				starpu_prefetch_task_input_on_node(task, config->cpus_nodeid);
-			else if (task->where == STARPU_CUDA && config->cuda_nodeid >= 0)
-				starpu_prefetch_task_input_on_node(task, config->cuda_nodeid);
-                        else if (task->cl->where == STARPU_FPGA && config->fpga_nodeid >= 0)
-				starpu_prefetch_task_input_on_node(task, config->fpga_nodeid);
-			else if (task->where == STARPU_OPENCL && config->opencl_nodeid >= 0)
-				starpu_prefetch_task_input_on_node(task, config->opencl_nodeid);
-			else if (task->where == STARPU_MIC && config->mic_nodeid >= 0)
-				starpu_prefetch_task_input_on_node(task, config->mic_nodeid);
-		}
-
 		if(!sched_ctx->sched_policy)
 		{
 			/* Note: we have to call that early, or else the task may have
@@ -637,6 +618,25 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 		}
 		else
 		{
+			/* When a task can only be executed on a given arch and we have
+			 * only one memory node for that arch, we can systematically
+			 * prefetch before the scheduling decision. */
+			if (!sched_ctx->sched_policy->prefetches
+				&& starpu_get_prefetch_flag()
+				&& starpu_memory_nodes_get_count() > 1)
+			{
+				if (task->where == STARPU_CPU && config->cpus_nodeid >= 0)
+					starpu_prefetch_task_input_on_node(task, config->cpus_nodeid);
+				else if (task->where == STARPU_CUDA && config->cuda_nodeid >= 0)
+					starpu_prefetch_task_input_on_node(task, config->cuda_nodeid);
+				else if (task->cl->where == STARPU_FPGA && config->fpga_nodeid >= 0)
+					starpu_prefetch_task_input_on_node(task, config->fpga_nodeid);
+				else if (task->where == STARPU_OPENCL && config->opencl_nodeid >= 0)
+					starpu_prefetch_task_input_on_node(task, config->opencl_nodeid);
+				else if (task->where == STARPU_MIC && config->mic_nodeid >= 0)
+					starpu_prefetch_task_input_on_node(task, config->mic_nodeid);
+			}
+
 			STARPU_ASSERT(sched_ctx->sched_policy->push_task);
 			/* check out if there are any workers in the context */
 			unsigned nworkers = starpu_sched_ctx_get_nworkers(sched_ctx->id);

+ 24 - 2
src/core/simgrid.c

@@ -58,6 +58,8 @@ extern int _starpu_mpi_simgrid_init(int argc, char *argv[]);
 extern void smpi_process_set_user_data(void *);
 #endif
 
+static double _starpu_simgrid_dynamic_energy = 0.0;
+
 /* 1 when MSG_init was done, 2 when initialized through redirected main, 3 when
  * initialized through MSG_process_attach */
 static int simgrid_started;
@@ -629,6 +631,7 @@ struct task
 #else
 	msg_task_t task;
 #endif
+	double energy;
 
 	/* communication termination signalization */
 	unsigned *finished;
@@ -666,6 +669,7 @@ static void *task_execute(void *arg)
 		MSG_task_execute(task->task);
 		MSG_task_destroy(task->task);
 #endif
+		starpu_energy_use(task->energy);
 		_STARPU_DEBUG("task %p finished\n", task);
 
 		*task->finished = 1;
@@ -702,7 +706,7 @@ void _starpu_simgrid_wait_tasks(int workerid)
 }
 
 /* Task execution submitted by StarPU */
-void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length, unsigned *finished)
+void _starpu_simgrid_submit_job(int workerid, int sched_ctx_id, struct _starpu_job *j, struct starpu_perfmodel_arch* perf_arch, double length, double energy, unsigned *finished)
 {
 	struct starpu_task *starpu_task = j->task;
 	double flops;
@@ -717,13 +721,19 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 
 	if (isnan(length))
 	{
-		length = starpu_task_expected_length(starpu_task, perf_arch, j->nimpl);
+		length = starpu_task_worker_expected_length(starpu_task, workerid, sched_ctx_id, j->nimpl);
 		STARPU_ASSERT_MSG(!_STARPU_IS_ZERO(length) && !isnan(length),
 				  "Codelet %s does not have a perfmodel (in directory %s), or is not calibrated enough, please re-run in non-simgrid mode until it is calibrated",
 				  _starpu_job_get_model_name(j), _starpu_get_perf_model_dir_codelet());
                 /* TODO: option to add variance according to performance model,
                  * to be able to easily check scheduling robustness */
 	}
+	if (isnan(energy))
+	{
+		energy = starpu_task_worker_expected_energy(starpu_task, workerid, sched_ctx_id, j->nimpl);
+		/* TODO: option to add variance according to performance model,
+		 * to be able to easily check scheduling robustness */
+	}
 
 #if defined(HAVE_SG_HOST_SPEED) || defined(sg_host_speed)
 #  if defined(HAVE_SG_HOST_SELF) || defined(sg_host_self)
@@ -754,6 +764,7 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 		MSG_task_execute(simgrid_task);
 		MSG_task_destroy(simgrid_task);
 #endif
+		starpu_energy_use(energy);
 	}
 	else
 	{
@@ -766,6 +777,7 @@ void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *j, struct star
 #else
 		task->task = simgrid_task;
 #endif
+		task->energy = energy;
 		task->finished = finished;
 		*finished = 0;
 		task->next = NULL;
@@ -1391,5 +1403,15 @@ void _starpu_simgrid_data_transfer(size_t size, unsigned src_node, unsigned dst_
 }
 #endif
 
+void starpu_energy_use(float joules)
+{
+	_starpu_simgrid_dynamic_energy += joules;
+}
+
+double starpu_energy_used(void)
+{
+	float idle_power = starpu_get_env_float_default("STARPU_IDLE_POWER", 0.0);
+	return _starpu_simgrid_dynamic_energy + idle_power * starpu_timing_now() / 1000000;
+}
 
 #endif
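
Usage sketch for the two energy counters introduced above: in simgrid mode every simulated task execution calls starpu_energy_use() with its energy-model estimate, and the application can read the total back with starpu_energy_used(), which also folds in the STARPU_IDLE_POWER static part. This assumes the functions are the ones exported by the starpu_stdlib.h additions of this commit; the explicit starpu_energy_use() call below merely shows that extra joules can be accounted by hand.

#include <stdio.h>
#include <starpu.h>

void report_energy(void)
{
	starpu_task_wait_for_all();

	/* optionally account some extra dynamic energy by hand (0.5 J here) */
	starpu_energy_use(0.5);

	printf("estimated energy: %f J\n", starpu_energy_used());
}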

+ 1 - 1
src/core/simgrid.h

@@ -66,7 +66,7 @@ void _starpu_simgrid_deinit_late(void);
 void _starpu_simgrid_actor_setup(void);
 void _starpu_simgrid_wait_tasks(int workerid);
 struct _starpu_job;
-void _starpu_simgrid_submit_job(int workerid, struct _starpu_job *job, struct starpu_perfmodel_arch* perf_arch, double length, unsigned *finished);
+void _starpu_simgrid_submit_job(int workerid, int sched_ctx_id, struct _starpu_job *job, struct starpu_perfmodel_arch* perf_arch, double length, double energy, unsigned *finished);
 struct _starpu_data_request;
 int _starpu_simgrid_transfer(size_t size, unsigned src_node, unsigned dst_node, struct _starpu_data_request *req);
 union _starpu_async_channel_event;

+ 1 - 0
src/core/task.c

@@ -30,6 +30,7 @@
 #include <common/utils.h>
 #include <common/fxt.h>
 #include <common/knobs.h>
+#include <datawizard/memory_nodes.h>
 #include <profiling/profiling.h>
 #include <profiling/bound.h>
 #include <math.h>

+ 27 - 0
src/core/topology.c

@@ -2086,7 +2086,11 @@ int _starpu_bind_thread_on_cpu(int cpuid STARPU_ATTRIBUTE_UNUSED, int workerid S
 		{
 			cpu_worker[cpuid] = workerid;
 			if (name)
+			{
+				if (cpu_name[cpuid])
+					free(cpu_name[cpuid]);
 				cpu_name[cpuid] = strdup(name);
+			}
 		}
 	}
 
@@ -3219,3 +3223,26 @@ void starpu_topology_print(FILE *output)
 		fprintf(output, "\n");
 	}
 }
+
+int starpu_get_pu_os_index(unsigned logical_index)
+{
+#ifdef STARPU_HAVE_HWLOC
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+	struct _starpu_machine_topology *topology = &config->topology;
+
+	hwloc_topology_t topo = topology->hwtopology;
+
+	return hwloc_get_obj_by_type(topo, HWLOC_OBJ_PU, logical_index)->os_index;
+#else
+	return logical_index;
+#endif
+}
+
+#ifdef STARPU_HAVE_HWLOC
+hwloc_topology_t starpu_get_hwloc_topology(void)
+{
+	struct _starpu_machine_config *config = _starpu_get_machine_config();
+
+	return config->topology.hwtopology;
+}
+#endif
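
A short usage sketch for the two new topology helpers, assuming they are declared in the public headers alongside this change: starpu_get_pu_os_index() converts a logical PU index into the OS index that external binding tools expect, and starpu_get_hwloc_topology() hands back the hwloc topology StarPU already discovered so the application does not have to load its own. hwloc_topology_get_depth() is a standard hwloc call.

#include <stdio.h>
#include <starpu.h>
#ifdef STARPU_HAVE_HWLOC
#include <hwloc.h>
#endif

void print_pu_mapping(unsigned logical)
{
	printf("logical PU %u -> OS index %d\n",
	       logical, starpu_get_pu_os_index(logical));

#ifdef STARPU_HAVE_HWLOC
	/* reuse the topology StarPU built instead of loading a new one */
	hwloc_topology_t topo = starpu_get_hwloc_topology();
	printf("topology depth: %d\n", hwloc_topology_get_depth(topo));
#endif
}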

+ 6 - 0
src/core/workers.c

@@ -2692,31 +2692,37 @@ int starpu_worker_get_relax_state(void)
 	return _starpu_worker_get_relax_state();
 }
 
+#undef starpu_worker_lock
 void starpu_worker_lock(int workerid)
 {
 	_starpu_worker_lock(workerid);
 }
 
+#undef starpu_worker_trylock
 int starpu_worker_trylock(int workerid)
 {
 	return _starpu_worker_trylock(workerid);
 }
 
+#undef starpu_worker_unlock
 void starpu_worker_unlock(int workerid)
 {
 	_starpu_worker_unlock(workerid);
 }
 
+#undef starpu_worker_lock_self
 void starpu_worker_lock_self(void)
 {
 	_starpu_worker_lock_self();
 }
 
+#undef starpu_worker_unlock_self
 void starpu_worker_unlock_self(void)
 {
 	_starpu_worker_unlock_self();
 }
 
+#undef starpu_wake_worker_relax
 int starpu_wake_worker_relax(int workerid)
 {
 	return _starpu_wake_worker_relax(workerid);

+ 6 - 0
src/core/workers.h

@@ -1132,6 +1132,7 @@ static inline void _starpu_worker_lock(int workerid)
 		STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
 	}
 }
+#define starpu_worker_lock _starpu_worker_lock
 
 static inline int _starpu_worker_trylock(int workerid)
 {
@@ -1162,6 +1163,7 @@ static inline int _starpu_worker_trylock(int workerid)
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&cur_worker->sched_mutex);
 	return ret;
 }
+#define starpu_worker_trylock _starpu_worker_trylock
 
 static inline void _starpu_worker_unlock(int workerid)
 {
@@ -1174,6 +1176,7 @@ static inline void _starpu_worker_unlock(int workerid)
 		starpu_worker_relax_off();
 	}
 }
+#define starpu_worker_unlock _starpu_worker_unlock
 
 static inline void _starpu_worker_lock_self(void)
 {
@@ -1182,6 +1185,7 @@ static inline void _starpu_worker_lock_self(void)
 	STARPU_ASSERT(worker != NULL);
 	STARPU_PTHREAD_MUTEX_LOCK_SCHED(&worker->sched_mutex);
 }
+#define starpu_worker_lock_self _starpu_worker_lock_self
 
 static inline void _starpu_worker_unlock_self(void)
 {
@@ -1190,6 +1194,7 @@ static inline void _starpu_worker_unlock_self(void)
 	STARPU_ASSERT(worker != NULL);
 	STARPU_PTHREAD_MUTEX_UNLOCK_SCHED(&worker->sched_mutex);
 }
+#define starpu_worker_unlock_self _starpu_worker_unlock_self
 
 static inline int _starpu_wake_worker_relax(int workerid)
 {
@@ -1198,6 +1203,7 @@ static inline int _starpu_wake_worker_relax(int workerid)
 	_starpu_worker_unlock(workerid);
 	return ret;
 }
+#define starpu_wake_worker_relax _starpu_wake_worker_relax
 
 int starpu_wake_worker_relax_light(int workerid);
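
The workers.c/workers.h hunks follow a macro-aliasing pattern: internal callers that include workers.h get the fast inline _starpu_worker_lock() and friends through the #define, while the #undef in workers.c keeps a real exported symbol for code compiled outside the library. A minimal stand-alone illustration of the pattern, with generic names rather than StarPU's:

/* ---- header side ---- */
static inline void _my_lock(int id)
{
	(void)id;	/* fast inline body for internal callers */
}
#define my_lock _my_lock

/* ---- implementation side ---- */
#undef my_lock
void my_lock(int id)
{
	/* exported symbol kept for callers built outside the library */
	_my_lock(id);
}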
 

+ 58 - 17
src/datawizard/coherency.c

@@ -411,7 +411,7 @@ int _starpu_determine_request_path(starpu_data_handle_t handle,
 /* handle->lock should be taken. r is returned locked. The node parameter
  * indicate either the source of the request, or the destination for a
  * write-only request. */
-static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, unsigned is_prefetch)
+static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, enum _starpu_is_prefetch is_prefetch)
 {
 	struct _starpu_data_request *r;
 
@@ -474,7 +474,7 @@ static struct _starpu_data_request *_starpu_search_existing_data_request(struct
 
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 								  struct _starpu_data_replicate *dst_replicate,
-								  enum starpu_data_access_mode mode, unsigned is_prefetch,
+								  enum starpu_data_access_mode mode, enum _starpu_is_prefetch is_prefetch,
 								  unsigned async,
 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 {
@@ -529,7 +529,13 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 #endif
 
 			if (dst_replicate->mc)
+			{
+				if (is_prefetch == STARPU_TASK_PREFETCH)
+					/* Make sure it stays there */
+					dst_replicate->mc->nb_tasks_prefetch++;
+
 				_starpu_memchunk_recently_used(dst_replicate->mc, requesting_node);
+			}
 		}
 
 		_starpu_spin_unlock(&handle->header_lock);
@@ -574,6 +580,9 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch) == 0)
 			{
 				_starpu_update_data_state(handle, dst_replicate, mode);
+				if (is_prefetch == STARPU_TASK_PREFETCH)
+					/* Make sure it stays there */
+					dst_replicate->mc->nb_tasks_prefetch++;
 
 				_starpu_spin_unlock(&handle->header_lock);
 
@@ -652,9 +661,17 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 				STARPU_ASSERT(r->next_req_count <= STARPU_MAXNODES);
 			}
 		}
-		else if (!write_invalidation)
-			/* The last request will perform the callback after termination */
-			_starpu_data_request_append_callback(r, callback_func, callback_arg);
+		else
+		{
+			if (is_prefetch == STARPU_TASK_PREFETCH)
+				/* Make last request add the prefetch count on the mc to keep the data
+				 * there until the task gets to execute.  */
+				r->nb_tasks_prefetch++;
+
+			if (!write_invalidation)
+				/* The last request will perform the callback after termination */
+				_starpu_data_request_append_callback(r, callback_func, callback_arg);
+		}
 
 
 		if (reused_requests[hop])
@@ -719,7 +736,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 }
 
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *dst_replicate,
-			       enum starpu_data_access_mode mode, unsigned detached, unsigned is_prefetch, unsigned async,
+			       enum starpu_data_access_mode mode, unsigned detached, enum _starpu_is_prefetch is_prefetch, unsigned async,
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 {
         _STARPU_LOG_IN();
@@ -733,7 +750,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 	if (cpt == STARPU_SPIN_MAXTRY)
 		_starpu_spin_lock(&handle->header_lock);
 
-	if (is_prefetch > 0)
+	if (is_prefetch > STARPU_FETCH)
 	{
 		unsigned src_node_mask = 0;
 
@@ -751,6 +768,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 		if (src_node_mask == 0)
 		{
 			/* no valid copy, nothing to prefetch */
+			STARPU_ASSERT_MSG(handle->init_cl, "Could not find a valid copy of the data, and no handle initialization function");
 			_starpu_spin_unlock(&handle->header_lock);
 			return 0;
 		}
@@ -789,17 +807,22 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 
 static int idle_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, 2, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_IDLEFETCH, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
 }
 
-static int prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
+static int task_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, 1, 1, NULL, NULL, prio, "prefetch_data_on_node");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_TASK_PREFETCH, 1, NULL, NULL, prio, "task_prefetch_data_on_node");
+}
+
+static int STARPU_ATTRIBUTE_UNUSED prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
+{
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_PREFETCH, 1, NULL, NULL, prio, "prefetch_data_on_node");
 }
 
 static int fetch_data(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, 0, 0, NULL, NULL, prio, "fetch_data");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, STARPU_FETCH, 0, NULL, NULL, prio, "fetch_data");
 }
 
 uint32_t _starpu_get_data_refcnt(starpu_data_handle_t handle, unsigned node)
@@ -899,7 +922,7 @@ int starpu_prefetch_task_input_on_node_prio(struct starpu_task *task, unsigned t
 	if (j->discontinuous != 0)
 		return 0;
 #endif
-	STARPU_ASSERT(!task->prefetched);
+	STARPU_ASSERT_MSG(!task->prefetched, "Prefetching was already requested for this task! Did you set 'prefetches' to 1 in the starpu_sched_policy structure?");
 	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	unsigned index;
 
@@ -918,10 +941,11 @@ int starpu_prefetch_task_input_on_node_prio(struct starpu_task *task, unsigned t
 		int node = _starpu_task_data_get_node_on_node(task, index, target_node);
 
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
-		prefetch_data_on_node(handle, node, replicate, mode, prio);
+		task_prefetch_data_on_node(handle, node, replicate, mode, prio);
 
 		_starpu_set_data_requested_flag_if_needed(handle, replicate);
 	}
+	task->prefetched = 1;
 
 	return 0;
 }
@@ -976,7 +1000,7 @@ int starpu_prefetch_task_input_for_prio(struct starpu_task *task, unsigned worke
 	if (j->discontinuous != 0)
 		return 0;
 #endif
-	STARPU_ASSERT(!task->prefetched);
+	STARPU_ASSERT_MSG(!task->prefetched, "Prefetching was already requested for this task! Did you set 'prefetches' to 1 in the starpu_sched_policy structure?");
 	unsigned nbuffers = STARPU_TASK_GET_NBUFFERS(task);
 	unsigned index;
 
@@ -995,10 +1019,11 @@ int starpu_prefetch_task_input_for_prio(struct starpu_task *task, unsigned worke
 		int node = _starpu_task_data_get_node_on_worker(task, index, worker);
 
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
-		prefetch_data_on_node(handle, node, replicate, mode, prio);
+		task_prefetch_data_on_node(handle, node, replicate, mode, prio);
 
 		_starpu_set_data_requested_flag_if_needed(handle, replicate);
 	}
+	task->prefetched = 1;
 
 	return 0;
 }
@@ -1140,7 +1165,7 @@ int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, in
 
 		if (async)
 		{
-			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, 0, 1,
+			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, STARPU_FETCH, 1,
 					_starpu_fetch_task_input_cb, worker, 0, "_starpu_fetch_task_input");
 #ifdef STARPU_SIMGRID
 			if (_starpu_simgrid_fetching_input_cost())
@@ -1230,7 +1255,23 @@ void _starpu_fetch_task_input_tail(struct starpu_task *task, struct _starpu_job
 		local_replicate = get_replicate(handle, mode, workerid, node);
 		_starpu_spin_lock(&handle->header_lock);
 		if (local_replicate->mc)
+		{
 			local_replicate->mc->diduse = 1;
+			if (task->prefetched && local_replicate->initialized &&
+				/* See prefetch conditions in
+				 * starpu_prefetch_task_input_on_node_prio and the like */
+				!(mode & (STARPU_SCRATCH|STARPU_REDUX)) &&
+				(mode & STARPU_R))
+			{
+				/* Allocations or transfer prefetches should have been done by now and marked
+				 * this mc as needed for us.
+				 * Now that we added a reference for the task, we can relieve that.  */
+				/* Note: the replicate might have been evicted in between, thus not 100% sure
+				 * that our prefetch request is still recorded here.  */
+				if (local_replicate->mc->nb_tasks_prefetch > 0)
+					local_replicate->mc->nb_tasks_prefetch--;
+			}
+		}
 		_starpu_spin_unlock(&handle->header_lock);
 
 		_STARPU_TASK_SET_INTERFACE(task , local_replicate->data_interface, descrs[index].index);
@@ -1379,7 +1420,7 @@ void _starpu_fetch_nowhere_task_input(struct _starpu_job *j)
 
 		local_replicate = get_replicate(handle, mode, -1, node);
 
-		_starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, 0, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
+		_starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, STARPU_FETCH, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
 	}
 
 	if (profiling && task->profiling_info)

+ 2 - 2
src/datawizard/coherency.h

@@ -298,7 +298,7 @@ struct _starpu_data_state
  * async means that _starpu_fetch_data_on_node will wait for completion of the request
  */
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate,
-			       enum starpu_data_access_mode mode, unsigned detached, unsigned is_prefetch, unsigned async,
+			       enum starpu_data_access_mode mode, unsigned detached, enum _starpu_is_prefetch is_prefetch, unsigned async,
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 /** This releases a reference on the handle */
 void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
@@ -341,7 +341,7 @@ int _starpu_determine_request_path(starpu_data_handle_t handle,
  */
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 								  struct _starpu_data_replicate *dst_replicate,
-								  enum starpu_data_access_mode mode, unsigned is_prefetch,
+								  enum starpu_data_access_mode mode, enum _starpu_is_prefetch is_prefetch,
 								  unsigned async,
 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 

+ 2 - 2
src/datawizard/copy_driver.c

@@ -203,7 +203,7 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 									unsigned donotread,
 									struct _starpu_data_request *req,
 									unsigned may_alloc,
-									unsigned prefetch STARPU_ATTRIBUTE_UNUSED)
+									enum _starpu_is_prefetch prefetch STARPU_ATTRIBUTE_UNUSED)
 {
 	if (!donotread)
 	{
@@ -221,7 +221,7 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 			/* We're not supposed to allocate there at the moment */
 			return -ENOMEM;
 
-		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, req ? req->prefetch : 0);
+		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, req ? req->prefetch : STARPU_FETCH);
 		if (ret_alloc)
 			return -ENOMEM;
 	}

+ 10 - 1
src/datawizard/copy_driver.h

@@ -47,6 +47,15 @@ extern "C"
 struct _starpu_data_request;
 struct _starpu_data_replicate;
 
+enum _starpu_is_prefetch
+{
+	STARPU_FETCH = 0,		/* A task really needs it now! */
+	STARPU_TASK_PREFETCH = 1,	/* A task will need it soon */
+	STARPU_PREFETCH = 2,		/* It is a good idea to have it asap */
+	STARPU_IDLEFETCH = 3,		/* Get this here when you have time to */
+	STARPU_NFETCH
+};
+
 #ifdef STARPU_USE_MIC
 /** MIC needs memory_node to know which MIC is concerned.
  * mark is used to wait asynchronous request.
@@ -132,7 +141,7 @@ int _starpu_driver_copy_data_1_to_1(starpu_data_handle_t handle,
 				    unsigned donotread,
 				    struct _starpu_data_request *req,
 				    unsigned may_alloc,
-				    unsigned prefetch);
+				    enum _starpu_is_prefetch prefetch);
 
 unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel);
 void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel);
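
The new enum replaces the old 0/1/2 prefetch flag with four ordered levels, which later hunks compare with >= to decide how aggressively to allocate and evict. Seen from the public API (as the user_interactions.c hunk below wires it up), the user-visible levels map onto the existing fetch/prefetch/idle-prefetch calls, while STARPU_TASK_PREFETCH is used internally when a scheduler prefetches a task's inputs. A small sketch, assuming handle is a registered data handle and node a valid memory node:

#include <starpu.h>

void warm_up(starpu_data_handle_t handle, unsigned node)
{
	/* STARPU_IDLEFETCH: move it whenever the bus has nothing better to do */
	starpu_data_idle_prefetch_on_node(handle, node, 1 /* async */);

	/* STARPU_PREFETCH: it is a good idea to have it there soon */
	starpu_data_prefetch_on_node(handle, node, 1 /* async */);

	/* STARPU_FETCH: needed right now, wait for the transfer */
	starpu_data_fetch_on_node(handle, node, 0 /* synchronous */);
}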

+ 47 - 29
src/datawizard/data_request.c

@@ -25,8 +25,11 @@
 #include <core/simgrid.h>
 
 /* requests that have not been treated at all */
+#ifdef STARPU_DEVEL
+#warning split into separate out/in queues for each node, so that MAX_PENDING_REQUESTS_PER_NODE is separate for them, since the links are bidirectional
+#endif
 static struct _starpu_data_request_prio_list data_requests[STARPU_MAXNODES];
-static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES];
+static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES]; /* Contains both task_prefetch and prefetch */
 static struct _starpu_data_request_prio_list idle_requests[STARPU_MAXNODES];
 static starpu_pthread_mutex_t data_requests_list_mutex[STARPU_MAXNODES];
 
@@ -121,7 +124,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 							 int handling_node,
 							 enum starpu_data_access_mode mode,
 							 unsigned ndeps,
-							 unsigned is_prefetch,
+							 enum _starpu_is_prefetch is_prefetch,
 							 int prio,
 							 unsigned is_write_invalidation,
 							 const char *origin)
@@ -153,6 +156,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
 	r->completed = 0;
 	r->prefetch = is_prefetch;
+	r->nb_tasks_prefetch = 0;
 	r->prio = prio;
 	r->retval = -1;
 	r->ndeps = ndeps;
@@ -307,9 +311,9 @@ void _starpu_post_data_request(struct _starpu_data_request *r)
 
 	/* insert the request in the proper list */
 	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node]);
-	if (r->prefetch == 2)
+	if (r->prefetch >= STARPU_IDLEFETCH)
 		_starpu_data_request_prio_list_push_back(&idle_requests[handling_node], r);
-	else if (r->prefetch)
+	else if (r->prefetch > STARPU_FETCH)
 		_starpu_data_request_prio_list_push_back(&prefetch_requests[handling_node], r);
 	else
 		_starpu_data_request_prio_list_push_back(&data_requests[handling_node], r);
@@ -410,6 +414,10 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 	/* Remove a reference on the destination replicate for the request */
 	if (dst_replicate)
 	{
+		if (dst_replicate->mc)
+			/* Make sure it stays there for the task.  */
+			dst_replicate->mc->nb_tasks_prefetch += r->nb_tasks_prefetch;
+
 		STARPU_ASSERT(dst_replicate->refcnt > 0);
 		dst_replicate->refcnt--;
 	}
@@ -460,7 +468,7 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 }
 
 /* TODO : accounting to see how much time was spent working for other people ... */
-static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned may_alloc, int prefetch)
+static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned may_alloc, enum _starpu_is_prefetch prefetch)
 {
 	starpu_data_handle_t handle = r->handle;
 
@@ -535,7 +543,7 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	return 0;
 }
 
-static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list *reqlist, unsigned src_node, unsigned may_alloc, unsigned n, unsigned *pushed, unsigned prefetch)
+static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list *reqlist, unsigned src_node, unsigned may_alloc, unsigned n, unsigned *pushed, enum _starpu_is_prefetch prefetch)
 {
 	struct _starpu_data_request *r;
 	struct _starpu_data_request_prio_list new_data_requests[prefetch + 1]; /* Indexed by prefetch level */
@@ -606,7 +614,7 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 			ret = res;
 			/* Prefetch requests might have gotten promoted while in tmp list */
 			_starpu_data_request_prio_list_push_back(&new_data_requests[r->prefetch], r);
-			if (prefetch)
+			if (prefetch > STARPU_FETCH)
 				/* Prefetching more there would make the situation even worse */
 				break;
 		}
@@ -636,20 +644,25 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 	if (i <= prefetch)
 	{
 		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
-		if (!(_starpu_data_request_prio_list_empty(&new_data_requests[0])))
+		if (!(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_FETCH])))
+		{
+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_FETCH], &data_requests[src_node]);
+			data_requests[src_node] = new_data_requests[STARPU_FETCH];
+		}
+		if (prefetch >= STARPU_TASK_PREFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_TASK_PREFETCH])))
 		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[0], &data_requests[src_node]);
-			data_requests[src_node] = new_data_requests[0];
+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_TASK_PREFETCH], &prefetch_requests[src_node]);
+			prefetch_requests[src_node] = new_data_requests[STARPU_TASK_PREFETCH];
 		}
-		if (prefetch >= 1 && !(_starpu_data_request_prio_list_empty(&new_data_requests[1])))
+		if (prefetch >= STARPU_PREFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_PREFETCH])))
 		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[1], &prefetch_requests[src_node]);
-			prefetch_requests[src_node] = new_data_requests[1];
+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_PREFETCH], &prefetch_requests[src_node]);
+			prefetch_requests[src_node] = new_data_requests[STARPU_PREFETCH];
 		}
-		if (prefetch >= 2 && !(_starpu_data_request_prio_list_empty(&new_data_requests[2])))
+		if (prefetch >= STARPU_IDLEFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_IDLEFETCH])))
 		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[2], &idle_requests[src_node]);
-			idle_requests[src_node] = new_data_requests[2];
+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_IDLEFETCH], &idle_requests[src_node]);
+			idle_requests[src_node] = new_data_requests[STARPU_IDLEFETCH];
 		}
 		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
 
@@ -675,17 +688,17 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 
 int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 {
-	return __starpu_handle_node_data_requests(data_requests, src_node, may_alloc, MAX_PENDING_REQUESTS_PER_NODE, pushed, 0);
+	return __starpu_handle_node_data_requests(data_requests, src_node, may_alloc, MAX_PENDING_REQUESTS_PER_NODE, pushed, STARPU_FETCH);
 }
 
 int _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 {
-	return __starpu_handle_node_data_requests(prefetch_requests, src_node, may_alloc, MAX_PENDING_PREFETCH_REQUESTS_PER_NODE, pushed, 1);
+	return __starpu_handle_node_data_requests(prefetch_requests, src_node, may_alloc, MAX_PENDING_PREFETCH_REQUESTS_PER_NODE, pushed, STARPU_PREFETCH);
 }
 
 int _starpu_handle_node_idle_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 {
-	return __starpu_handle_node_data_requests(idle_requests, src_node, may_alloc, MAX_PENDING_IDLE_REQUESTS_PER_NODE, pushed, 2);
+	return __starpu_handle_node_data_requests(idle_requests, src_node, may_alloc, MAX_PENDING_IDLE_REQUESTS_PER_NODE, pushed, STARPU_IDLEFETCH);
 }
 
 static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
@@ -836,11 +849,15 @@ int _starpu_check_that_no_data_request_is_pending(unsigned node)
 }
 
 
-void _starpu_update_prefetch_status(struct _starpu_data_request *r, unsigned prefetch)
+void _starpu_update_prefetch_status(struct _starpu_data_request *r, enum _starpu_is_prefetch prefetch)
 {
 	STARPU_ASSERT(r->prefetch > prefetch);
 	r->prefetch=prefetch;
 
+	if (prefetch >= STARPU_IDLEFETCH)
+		/* No possible actual change */
+		return;
+
 	/* We have to promote chained_request too! */
 	unsigned chained_req;
 	for (chained_req = 0; chained_req < r->next_req_count; chained_req++)
@@ -852,19 +869,20 @@ void _starpu_update_prefetch_status(struct _starpu_data_request *r, unsigned pre
 
 	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[r->handling_node]);
 
+	int found = 1;
+
 	/* The request can be in a different list (handling request or the temp list)
-	 * we have to check that it is really in the prefetch list. */
+	 * we have to check that it is really in the prefetch or idle list. */
 	if (_starpu_data_request_prio_list_ismember(&prefetch_requests[r->handling_node], r))
-	{
-		_starpu_data_request_prio_list_erase(&prefetch_requests[r->handling_node],r);
-		_starpu_data_request_prio_list_push_back(&data_requests[r->handling_node],r);
-	}
-	/* The request can be in a different list (handling request or the temp list)
-	 * we have to check that it is really in the idle list. */
+		_starpu_data_request_prio_list_erase(&prefetch_requests[r->handling_node], r);
 	else if (_starpu_data_request_prio_list_ismember(&idle_requests[r->handling_node], r))
+		_starpu_data_request_prio_list_erase(&idle_requests[r->handling_node], r);
+	else
+		found = 0;
+
+	if (found)
 	{
-		_starpu_data_request_prio_list_erase(&idle_requests[r->handling_node],r);
-		if (prefetch == 1)
+		if (prefetch > STARPU_FETCH)
 			_starpu_data_request_prio_list_push_back(&prefetch_requests[r->handling_node],r);
 		else
 			_starpu_data_request_prio_list_push_back(&data_requests[r->handling_node],r);

+ 7 - 8
src/datawizard/data_request.h

@@ -79,12 +79,11 @@ LIST_TYPE(_starpu_data_request,
 	/** Whether the transfer is completed. */
 	unsigned completed;
 
-	/** Whether this is just a prefetch request:
-	 * 0 for fetch,
-	 * 1 for prefetch (dependencies have just been released)
-	 * 2 for idle (a good idea to do it some time, but no hurry at all)
-	 */
-	unsigned prefetch;
+	/** Whether this is just a prefetch request */
+	enum _starpu_is_prefetch prefetch;
+
+	/** Number of tasks which used this as a prefetch */
+	unsigned nb_tasks_prefetch;
 
 	/** Priority of the request. Default is 0 */
 	int prio;
@@ -151,7 +150,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 							 int handling_node,
 							 enum starpu_data_access_mode mode,
 							 unsigned ndeps,
-							 unsigned is_prefetch,
+							 enum _starpu_is_prefetch is_prefetch,
 							 int prio,
 							 unsigned is_write_invalidation,
 							 const char *origin) STARPU_ATTRIBUTE_MALLOC;
@@ -162,5 +161,5 @@ void _starpu_data_request_append_callback(struct _starpu_data_request *r,
 					  void (*callback_func)(void *),
 					  void *callback_arg);
 
-void _starpu_update_prefetch_status(struct _starpu_data_request *r, unsigned prefetch);
+void _starpu_update_prefetch_status(struct _starpu_data_request *r, enum _starpu_is_prefetch prefetch);
 #endif // __DATA_REQUEST_H__

+ 2 - 1
src/datawizard/filters.c

@@ -21,6 +21,7 @@
 #include <datawizard/filters.h>
 #include <datawizard/footprint.h>
 #include <datawizard/interfaces/data_interface.h>
+#include <datawizard/memory_nodes.h>
 #include <core/task.h>
 
 /*
@@ -192,7 +193,7 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		int home_node = initial_handle->home_node;
 		if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
 			home_node = STARPU_MAIN_RAM;
-		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], 0);
+		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], STARPU_FETCH);
 #ifdef STARPU_DEVEL
 #warning we should reclaim memory if allocation failed
 #endif

+ 3 - 0
src/datawizard/interfaces/bcsr_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 /*
  * BCSR : blocked CSR, we use blocks of size (r x c)

+ 3 - 0
src/datawizard/interfaces/block_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 

+ 3 - 0
src/datawizard/interfaces/coo_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int
 copy_any_to_any(void *src_interface, unsigned src_node,

+ 3 - 0
src/datawizard/interfaces/csr_interface.c

@@ -16,6 +16,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 

+ 1 - 1
src/datawizard/interfaces/data_interface.c

@@ -693,7 +693,7 @@ void _starpu_check_if_valid_and_fetch_data_on_node(starpu_data_handle_t handle,
 	}
 	if (valid)
 	{
-		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, 0, 0, NULL, NULL, 0, origin);
+		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, STARPU_FETCH, 0, NULL, NULL, 0, origin);
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, handle->home_node, replicate);
 	}

+ 3 - 0
src/datawizard/interfaces/matrix_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 

+ 3 - 0
src/datawizard/interfaces/multiformat_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int copy_ram_to_ram(void *src_interface, unsigned src_node STARPU_ATTRIBUTE_UNUSED, void *dst_interface, unsigned dst_node);
 #ifdef STARPU_USE_CUDA

+ 3 - 0
src/datawizard/interfaces/tensor_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 

+ 3 - 0
src/datawizard/interfaces/variable_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 

+ 3 - 0
src/datawizard/interfaces/vector_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int copy_any_to_any(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 

+ 3 - 0
src/datawizard/interfaces/void_interface.c

@@ -15,6 +15,9 @@
  */
 
 #include <starpu.h>
+#ifdef BUILDING_STARPU
+#include <datawizard/memory_nodes.h>
+#endif
 
 static int dummy_copy(void *src_interface, unsigned src_node, void *dst_interface, unsigned dst_node, void *async_data);
 

+ 23 - 18
src/datawizard/memalloc.c

@@ -322,7 +322,7 @@ static int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT transfer_subtree_to_node(starpu_d
 		{
 			/* This is the only copy, push it to destination */
 			struct _starpu_data_request *r;
-			r = _starpu_create_request_to_fetch_data(handle, dst_replicate, STARPU_R, 0, 0, NULL, NULL, 0, "transfer_subtree_to_node");
+			r = _starpu_create_request_to_fetch_data(handle, dst_replicate, STARPU_R, STARPU_FETCH, 0, NULL, NULL, 0, "transfer_subtree_to_node");
 			/* There is no way we don't need a request, since
 			 * source is OWNER, destination can't be having it */
 			STARPU_ASSERT(r);
@@ -546,7 +546,7 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 /* This function is called for memory chunks that are possibly in used (ie. not
  * in the cache). They should therefore still be associated to a handle. */
 /* mc_lock is held and may be temporarily released! */
-static size_t try_to_throw_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node, struct _starpu_data_replicate *replicate, unsigned is_already_in_mc_list)
+static size_t try_to_throw_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node, struct _starpu_data_replicate *replicate, unsigned is_already_in_mc_list, enum _starpu_is_prefetch is_prefetch)
 {
 	size_t freed = 0;
 
@@ -571,6 +571,10 @@ static size_t try_to_throw_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node
 		/* Hasn't been used yet, avoid evicting it */
 		return 0;
 
+	if (mc->nb_tasks_prefetch && is_prefetch >= STARPU_TASK_PREFETCH)
+		/* We have not finished executing the tasks this was prefetched for */
+		return 0;
+
 	/* REDUX memchunk */
 	if (mc->relaxed_coherency == 2)
 	{
@@ -782,7 +786,7 @@ static int try_to_find_reusable_mc(unsigned node, starpu_data_handle_t data, str
 
 /* this function looks for a memory chunk that matches a given footprint in the
  * list of mem chunk that are not important */
-static int try_to_reuse_not_important_mc(unsigned node, starpu_data_handle_t data, struct _starpu_data_replicate *replicate, uint32_t footprint)
+static int try_to_reuse_not_important_mc(unsigned node, starpu_data_handle_t data, struct _starpu_data_replicate *replicate, uint32_t footprint, enum _starpu_is_prefetch is_prefetch)
 {
 	struct _starpu_mem_chunk *mc, *orig_next_mc, *next_mc;
 	int success = 0;
@@ -816,7 +820,7 @@ restart:
 		}
 
 		/* Note: this may unlock mc_list! */
-		success = try_to_throw_mem_chunk(mc, node, replicate, 1);
+		success = try_to_throw_mem_chunk(mc, node, replicate, 1, is_prefetch);
 
 		if (orig_next_mc)
 		{
@@ -841,11 +845,14 @@ restart:
  * Try to find a buffer currently in use on the memory node which has the given
  * footprint.
  */
-static int try_to_reuse_potentially_in_use_mc(unsigned node, starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, uint32_t footprint, int is_prefetch)
+static int try_to_reuse_potentially_in_use_mc(unsigned node, starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, uint32_t footprint, enum _starpu_is_prefetch is_prefetch)
 {
 	struct _starpu_mem_chunk *mc, *next_mc, *orig_next_mc;
 	int success = 0;
 
+	if (is_prefetch >= STARPU_IDLEFETCH)
+		/* Do not evict a MC just for an idle fetch */
+		return 0;
 	/*
 	 * We have to unlock mc_lock before locking header_lock, so we have
 	 * to be careful with the list.  We try to do just one pass, by
@@ -868,14 +875,11 @@ restart:
 		if (mc->remove_notify)
 			/* Somebody already working here, skip */
 			continue;
-		if (is_prefetch > 1)
-			/* Do not evict a MC just for an idle fetch */
-			continue;
-		if (is_prefetch == 1 && !mc->wontuse)
+		if (!mc->wontuse && is_prefetch >= STARPU_PREFETCH)
 			/* Do not evict something that we might reuse, just for a prefetch */
-			/* FIXME: but perhaps we won't have any task using it in
-                         * the close future, we should perhaps rather check
-                         * mc->replicate->refcnt? */
+			continue;
+		if (mc->nb_tasks_prefetch && is_prefetch >= STARPU_TASK_PREFETCH)
+			/* Do not evict something that we will reuse, just for a task prefetch */
 			continue;
 		if (mc->footprint != footprint || _starpu_data_interface_compare(handle->per_node[node].data_interface, handle->ops, mc->data->per_node[node].data_interface, mc->ops) != 1)
 			/* Not the right type of interface, skip */
@@ -889,7 +893,7 @@ restart:
 		}
 
 		/* Note: this may unlock mc_list! */
-		success = try_to_throw_mem_chunk(mc, node, replicate, 1);
+		success = try_to_throw_mem_chunk(mc, node, replicate, 1, is_prefetch);
 
 		if (orig_next_mc)
 		{
@@ -999,7 +1003,7 @@ restart2:
 				next_mc->remove_notify = &next_mc;
 			}
 			/* Note: this may unlock mc_list! */
-			freed += try_to_throw_mem_chunk(mc, node, NULL, 0);
+			freed += try_to_throw_mem_chunk(mc, node, NULL, 0, STARPU_FETCH);
 
 			if (orig_next_mc)
 			{
@@ -1218,7 +1222,7 @@ void starpu_memchunk_tidy(unsigned node)
 			}
 
 			_starpu_spin_unlock(&mc_lock[node]);
-			if (!_starpu_create_request_to_fetch_data(handle, &handle->per_node[target_node], STARPU_R, 2, 1, NULL, NULL, 0, "starpu_memchunk_tidy"))
+			if (!_starpu_create_request_to_fetch_data(handle, &handle->per_node[target_node], STARPU_R, STARPU_IDLEFETCH, 1, NULL, NULL, 0, "starpu_memchunk_tidy"))
 			{
 				/* No request was actually needed??
 				 * Odd, but cope with it.  */
@@ -1317,6 +1321,7 @@ static struct _starpu_mem_chunk *_starpu_memchunk_init(struct _starpu_data_repli
 	mc->size_interface = interface_size;
 	mc->remove_notify = NULL;
 	mc->diduse = 0;
+	mc->nb_tasks_prefetch = 0;
 	mc->wontuse = 0;
 
 	return mc;
@@ -1430,7 +1435,7 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _star
  *
  */
 
-static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, unsigned is_prefetch)
+static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, enum _starpu_is_prefetch is_prefetch)
 {
 	unsigned attempts = 0;
 	starpu_ssize_t allocated_memory;
@@ -1514,7 +1519,7 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 			reclaim -= freed;
 
 			/* Try to reuse an allocated data with the same interface (to avoid spurious free/alloc) */
-			if (_starpu_has_not_important_data && try_to_reuse_not_important_mc(dst_node, handle, replicate, footprint))
+			if (_starpu_has_not_important_data && try_to_reuse_not_important_mc(dst_node, handle, replicate, footprint, is_prefetch))
 				break;
 			if (try_to_reuse_potentially_in_use_mc(dst_node, handle, replicate, footprint, is_prefetch))
 			{
@@ -1596,7 +1601,7 @@ out:
 	return allocated_memory;
 }
 
-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch)
+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum _starpu_is_prefetch is_prefetch)
 {
 	starpu_ssize_t allocated_memory;
 

+ 9 - 1
src/datawizard/memalloc.h

@@ -26,6 +26,7 @@
 #include <datawizard/interfaces/data_interface.h>
 #include <datawizard/coherency.h>
 #include <datawizard/copy_driver.h>
+#include <datawizard/data_request.h>
 
 struct _starpu_data_replicate;
 
@@ -59,10 +60,17 @@ LIST_TYPE(_starpu_mem_chunk,
 	/** Whether the memchunk is in the clean part of the mc_list */
 	unsigned clean:1;
 	/** Was this chunk used since it got allocated?  */
+	/* FIXME: probably useless now with nb_tasks_prefetch */
 	unsigned diduse:1;
 	/** Was this chunk marked as "won't use"? */
 	unsigned wontuse:1;
 
+	/** The number of prefetches that we made for this mc for various tasks.
+	 * This is also the number of tasks that we will wait to see use this mc before
+	 * we attempt to evict it.
+	 */
+	unsigned nb_tasks_prefetch;
+
 	/** the size of the data is only set when calling _starpu_request_mem_chunk_removal(),
 	 * it is needed to estimate how much memory is in mc_cache, and by
 	 * free_memory_on_node() which is called when the handle is no longer
@@ -84,7 +92,7 @@ void _starpu_init_mem_chunk_lists(void);
 void _starpu_deinit_mem_chunk_lists(void);
 void _starpu_mem_chunk_init_last(void);
 void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size);
-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch);
+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum _starpu_is_prefetch is_prefetch);
 size_t _starpu_free_all_automatically_allocated_buffers(unsigned node);
 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);
 void _starpu_memchunk_wont_use(struct _starpu_mem_chunk *mc, unsigned node);
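
The nb_tasks_prefetch counter added above is incremented when a task prefetch reaches (or already finds) a valid replicate, decremented in _starpu_fetch_task_input_tail() once the task takes its own reference, and consulted before eviction. A self-contained toy model of that eviction guard (not StarPU code, just the comparison logic from try_to_throw_mem_chunk()):

#include <stdio.h>

enum is_prefetch { FETCH = 0, TASK_PREFETCH, PREFETCH, IDLEFETCH };

struct chunk
{
	unsigned nb_tasks_prefetch;	/* pending task prefetches on this chunk */
};

static int may_evict(const struct chunk *mc, enum is_prefetch why)
{
	if (mc->nb_tasks_prefetch && why >= TASK_PREFETCH)
		return 0;	/* keep it until the prefetched tasks have run */
	return 1;
}

int main(void)
{
	struct chunk mc = { .nb_tasks_prefetch = 2 };
	/* another prefetch may not evict it, a plain fetch still may: prints "0 1" */
	printf("%d %d\n", may_evict(&mc, PREFETCH), may_evict(&mc, FETCH));
	return 0;
}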

+ 1 - 0
src/datawizard/memory_manager.c

@@ -19,6 +19,7 @@
 #include <common/thread.h>
 #include <common/fxt.h>
 #include <datawizard/memory_manager.h>
+#include <datawizard/memory_nodes.h>
 #include <core/workers.h>
 #include <starpu_stdlib.h>
 

+ 1 - 0
src/datawizard/reduction.c

@@ -22,6 +22,7 @@
 #include <datawizard/datawizard.h>
 #include <drivers/mic/driver_mic_source.h>
 #include <drivers/mp_common/source_common.h>
+#include <datawizard/memory_nodes.h>
 
 void starpu_data_set_reduction_methods(starpu_data_handle_t handle,
 				       struct starpu_codelet *redux_cl,

+ 7 - 6
src/datawizard/user_interactions.c

@@ -22,6 +22,7 @@
 #include <datawizard/write_back.h>
 #include <core/dependencies/data_concurrency.h>
 #include <core/sched_policy.h>
+#include <datawizard/memory_nodes.h>
 
 static void _starpu_data_check_initialized(starpu_data_handle_t handle, enum starpu_data_access_mode mode)
 {
@@ -46,7 +47,7 @@ int starpu_data_request_allocation(starpu_data_handle_t handle, unsigned node)
 
 	_starpu_spin_lock(&handle->header_lock);
 
-	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, STARPU_NONE, 0, 1, 0, 0, "starpu_data_request_allocation");
+	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, STARPU_NONE, 0, STARPU_PREFETCH, 0, 0, "starpu_data_request_allocation");
 
 	/* we do not increase the refcnt associated to the request since we are
 	 * not waiting for its termination */
@@ -67,7 +68,7 @@ struct user_interaction_wrapper
 	starpu_pthread_mutex_t lock;
 	unsigned finished;
 	unsigned detached;
-	unsigned prefetch;
+	enum _starpu_is_prefetch prefetch;
 	unsigned async;
 	int prio;
 	void (*callback)(void *);
@@ -535,7 +536,7 @@ static void _prefetch_data_on_node(void *arg)
 }
 
 static
-int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigned node, unsigned async, enum starpu_data_access_mode mode, unsigned prefetch, int prio)
+int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigned node, unsigned async, enum starpu_data_access_mode mode, enum _starpu_is_prefetch prefetch, int prio)
 {
 	STARPU_ASSERT(handle);
 
@@ -595,12 +596,12 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 
 int starpu_data_fetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 {
-	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, 0, 0);
+	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_FETCH, 0);
 }
 
 int starpu_data_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
 {
-	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, 1, prio);
+	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_PREFETCH, prio);
 }
 
 int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
@@ -610,7 +611,7 @@ int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, uns
 
 int starpu_data_idle_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
 {
-	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, 2, prio);
+	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_IDLEFETCH, prio);
 }
 
 int starpu_data_idle_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)

+ 2 - 1
src/datawizard/write_back.c

@@ -17,6 +17,7 @@
 #include <datawizard/datawizard.h>
 #include <datawizard/write_back.h>
 #include <core/dependencies/data_concurrency.h>
+#include <datawizard/memory_nodes.h>
 
 static void wt_callback(void *arg)
 {
@@ -63,7 +64,7 @@ void _starpu_write_through_data(starpu_data_handle_t handle, unsigned requesting
 
 				struct _starpu_data_request *r;
 				r = _starpu_create_request_to_fetch_data(handle, &handle->per_node[node],
-									 STARPU_R, 1, 1, wt_callback, handle, 0, "_starpu_write_through_data");
+									 STARPU_R, STARPU_IDLEFETCH, 1, wt_callback, handle, 0, "_starpu_write_through_data");
 
 			        /* If no request was created, the handle was already up-to-date on the
 			         * node */

+ 2 - 2
src/debug/latency.c

@@ -34,7 +34,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		_starpu_spin_unlock(&handle->header_lock);
 
 		struct _starpu_data_replicate *replicate_0 = &handle->per_node[node0];
-		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, 0, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
+		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, node0, replicate_0);
 
@@ -44,7 +44,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		_starpu_spin_unlock(&handle->header_lock);
 
 		struct _starpu_data_replicate *replicate_1 = &handle->per_node[node1];
-		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, 0, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
+		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, node1, replicate_1);
 	}

+ 10 - 8
src/debug/traces/starpu_fxt.c

@@ -20,6 +20,7 @@
 #include <starpu.h>
 #include <common/config.h>
 #include <common/uthash.h>
+#include <datawizard/copy_driver.h>
 #include <string.h>
 
 #ifdef STARPU_HAVE_POTI
@@ -1194,8 +1195,8 @@ static void handle_new_mem_node(struct fxt_ev_64 *ev, struct starpu_fxt_options
  */
 static int create_ordered_stream_id (int nodeid, int devid)
 {
-	static int stable[MAX_MPI_NODES][STARPU_MAXCUDADEVS];
-	STARPU_ASSERT(nodeid < MAX_MPI_NODES);
+	static int stable[STARPU_FXT_MAX_FILES][STARPU_MAXCUDADEVS];
+	STARPU_ASSERT(nodeid < STARPU_FXT_MAX_FILES);
 	STARPU_ASSERT(devid < STARPU_MAXCUDADEVS);
 	return stable[nodeid][devid]++;
 }
@@ -2268,13 +2269,14 @@ static void handle_mpi_data_set_tag(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 	data->mpi_tag = tag;
 }
 
-static const char *copy_link_type(unsigned prefetch)
+static const char *copy_link_type(enum _starpu_is_prefetch prefetch)
 {
 	switch (prefetch)
 	{
-		case 0: return "F";
-		case 1: return "PF";
-		case 2: return "IF";
+		case STARPU_FETCH: return "F";
+		case STARPU_TASK_PREFETCH: return "TF";
+		case STARPU_PREFETCH: return "PF";
+		case STARPU_IDLEFETCH: return "IF";
 		default: STARPU_ASSERT(0);
 	}
 }
@@ -2285,7 +2287,7 @@ static void handle_start_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 	unsigned dst = ev->param[1];
 	unsigned size = ev->param[2];
 	unsigned comid = ev->param[3];
-	unsigned prefetch = ev->param[4];
+	enum _starpu_is_prefetch prefetch = ev->param[4];
 	unsigned long handle = ev->param[5];
 	const char *link_type = copy_link_type(prefetch);
 
@@ -2367,7 +2369,7 @@ static void handle_end_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 	unsigned dst = ev->param[1];
 	unsigned long size = ev->param[2];
 	unsigned comid = ev->param[3];
-	unsigned prefetch = ev->param[4];
+	enum _starpu_is_prefetch prefetch = ev->param[4];
 	const char *link_type = copy_link_type(prefetch);
 
 	char *prefix = options->file_prefix;

+ 0 - 2
src/debug/traces/starpu_fxt.h

@@ -41,8 +41,6 @@
 #include <starpu.h>
 #include "../../../include/starpu_fxt.h"
 
-#define MAX_MPI_NODES 64
-
 extern char _starpu_last_codelet_symbol[STARPU_NMAXWORKERS][(FXT_MAX_PARAMS-5)*sizeof(unsigned long)];
 
 void _starpu_fxt_dag_init(char *dag_filename);

+ 19 - 19
src/debug/traces/starpu_fxt_mpi.c

@@ -103,27 +103,27 @@ int _starpu_fxt_mpi_find_sync_point(char *filename_in, uint64_t *offset, int *ke
  */
 
 /* the list of MPI transfers found in the different traces */
-static struct mpi_transfer *mpi_sends[MAX_MPI_NODES] = {NULL};
-static struct mpi_transfer *mpi_recvs[MAX_MPI_NODES] = {NULL};
+static struct mpi_transfer *mpi_sends[STARPU_FXT_MAX_FILES] = {NULL};
+static struct mpi_transfer *mpi_recvs[STARPU_FXT_MAX_FILES] = {NULL};
 
 /* number of available slots in the lists  */
-unsigned mpi_sends_list_size[MAX_MPI_NODES] = {0};
-unsigned mpi_recvs_list_size[MAX_MPI_NODES] = {0};
+unsigned mpi_sends_list_size[STARPU_FXT_MAX_FILES] = {0};
+unsigned mpi_recvs_list_size[STARPU_FXT_MAX_FILES] = {0};
 
 /* number of slots actually used in the list  */
-unsigned mpi_sends_used[MAX_MPI_NODES] = {0};
-unsigned mpi_recvs_used[MAX_MPI_NODES] = {0};
+unsigned mpi_sends_used[STARPU_FXT_MAX_FILES] = {0};
+unsigned mpi_recvs_used[STARPU_FXT_MAX_FILES] = {0};
 
 /* number of slots already matched at the beginning of the list. This permits
  * going through the lists from the beginning to match each and every
  * transfer, thus avoiding a quadratic complexity. */
-unsigned mpi_recvs_matched[MAX_MPI_NODES][MAX_MPI_NODES] = { {0} };
-unsigned mpi_sends_matched[MAX_MPI_NODES][MAX_MPI_NODES] = { {0} };
+unsigned mpi_recvs_matched[STARPU_FXT_MAX_FILES][STARPU_FXT_MAX_FILES] = { {0} };
+unsigned mpi_sends_matched[STARPU_FXT_MAX_FILES][STARPU_FXT_MAX_FILES] = { {0} };
 
 void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED, long mpi_tag, size_t size, float date, long jobid, unsigned long handle)
 {
 	STARPU_ASSERT(src >= 0);
-	if (src >= MAX_MPI_NODES)
+	if (src >= STARPU_FXT_MAX_FILES)
 		return;
 	unsigned slot = mpi_sends_used[src]++;
 
@@ -153,7 +153,7 @@ void _starpu_fxt_mpi_add_send_transfer(int src, int dst STARPU_ATTRIBUTE_UNUSED,
 
 void _starpu_fxt_mpi_add_recv_transfer(int src STARPU_ATTRIBUTE_UNUSED, int dst, long mpi_tag, float date, long jobid, unsigned long handle)
 {
-	if (dst >= MAX_MPI_NODES)
+	if (dst >= STARPU_FXT_MAX_FILES)
 		return;
 	unsigned slot = mpi_recvs_used[dst]++;
 
@@ -220,11 +220,11 @@ static unsigned long mpi_com_id = 0;
 
 static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comms_file, unsigned n)
 {
-	unsigned slot[MAX_MPI_NODES] = { 0 }, node;
+	unsigned slot[STARPU_FXT_MAX_FILES] = { 0 }, node;
 	unsigned nb_wrong_comm_timing = 0;
 	struct mpi_transfer_list pending_receives; /* Sorted list of matches which have not happened yet */
-	double current_out_bandwidth[MAX_MPI_NODES] = { 0. };
-	double current_in_bandwidth[MAX_MPI_NODES] = { 0. };
+	double current_out_bandwidth[STARPU_FXT_MAX_FILES] = { 0. };
+	double current_in_bandwidth[STARPU_FXT_MAX_FILES] = { 0. };
 #ifdef STARPU_HAVE_POTI
 	char mpi_container[STARPU_POTI_STR_LEN];
 #endif
@@ -246,7 +246,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 		else
 			start_date = mpi_transfer_list_front(&pending_receives)->date;
 
-		src = MAX_MPI_NODES;
+		src = STARPU_FXT_MAX_FILES;
 		for (node = 0; node < n; node++)
 		{
 			if (slot[node] < mpi_sends_used[node] && mpi_sends[node][slot[node]].date < start_date)
@@ -260,7 +260,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 			/* No event any more, we're finished! */
 			break;
 
-		if (src == MAX_MPI_NODES)
+		if (src == STARPU_FXT_MAX_FILES)
 		{
 			/* Pending match is earlier than all new sends, finish its communication */
 			match = mpi_transfer_list_pop_front(&pending_receives);
@@ -284,7 +284,7 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 		size_t size = cur->size;
 		unsigned long send_handle = cur->handle;
 
-		if (dst < MAX_MPI_NODES)
+		if (dst < STARPU_FXT_MAX_FILES)
 			match = try_to_match_send_transfer(src, dst, mpi_tag);
 		else
 			match = NULL;
@@ -377,10 +377,10 @@ static void display_all_transfers_from_trace(FILE *out_paje_file, FILE *out_comm
 
 void _starpu_fxt_display_mpi_transfers(struct starpu_fxt_options *options, int *ranks STARPU_ATTRIBUTE_UNUSED, FILE *out_paje_file, FILE* out_comms_file)
 {
-	if (options->ninputfiles > MAX_MPI_NODES)
+	if (options->ninputfiles > STARPU_FXT_MAX_FILES)
 	{
-		_STARPU_DISP("Warning: %u files given, maximum %u supported, truncating to %u\n", options->ninputfiles, MAX_MPI_NODES, MAX_MPI_NODES);
-		options->ninputfiles = MAX_MPI_NODES;
+		_STARPU_DISP("Warning: %u files given, maximum %u supported, truncating to %u\n", options->ninputfiles, STARPU_FXT_MAX_FILES, STARPU_FXT_MAX_FILES);
+		options->ninputfiles = STARPU_FXT_MAX_FILES;
 	}
 
 	/* display the MPI transfers if possible */

+ 2 - 0
src/debug/traces/starpu_paje.c

@@ -398,6 +398,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 	/* Link types */
 	poti_DefineLinkType("MPIL", "MPIP", "MPICt", "MPICt", "MPI communication");
 	poti_DefineLinkType("F", "P", "Mm", "Mm", "Intra-node data Fetch");
+	poti_DefineLinkType("TF", "P", "Mm", "Mm", "Intra-node data TaskPreFetch");
 	poti_DefineLinkType("PF", "P", "Mm", "Mm", "Intra-node data PreFetch");
 	poti_DefineLinkType("IF", "P", "Mm", "Mm", "Intra-node data IdleFetch");
 	poti_DefineLinkType("WSL", "P", "W", "W", "Work steal");
@@ -551,6 +552,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 6       No       MS     Nothing         \".0 .0 .0\"		\n\
 5       MPIL     MPIP	MPICt	MPICt   \"MPI communication\"\n\
 5       F       P	Mm	Mm      \"Intra-node data Fetch\"\n\
+5       TF      P	Mm	Mm      \"Intra-node data TaskPreFetch\"\n\
 5       PF      P	Mm	Mm      \"Intra-node data PreFetch\"\n\
 5       IF      P	Mm	Mm      \"Intra-node data IdleFetch\"\n\
 5       WSL     P	W	W       \"Work steal\"\n");

+ 4 - 1
src/drivers/cpu/driver_cpu.c

@@ -109,7 +109,10 @@ static int execute_job_on_cpu(struct _starpu_job *j, struct starpu_task *worker_
 				_SIMGRID_TIMER_END;
 			}
 			else
-				_starpu_simgrid_submit_job(cpu_args->workerid, j, perf_arch, NAN, NULL);
+			{
+				struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(cpu_args, j);
+				_starpu_simgrid_submit_job(cpu_args->workerid, sched_ctx->id, j, perf_arch, NAN, NAN, NULL);
+			}
 #else
 #  ifdef STARPU_PAPI
 			_starpu_profiling_papi_task_start_counters(task);

+ 29 - 20
src/drivers/cuda/driver_cuda.c

@@ -531,10 +531,13 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 				_SIMGRID_TIMER_END;
 			}
 		else
-			_starpu_simgrid_submit_job(workerid, j, &worker->perf_arch, NAN,
+		{
+			struct _starpu_sched_ctx *sched_ctx = _starpu_sched_ctx_get_sched_ctx_for_worker_and_job(worker, j);
+			_starpu_simgrid_submit_job(workerid, sched_ctx->id, j, &worker->perf_arch, NAN, NAN,
 				async ? &task_finished[workerid][pipeline_idx] : NULL);
+		}
 #else
-#ifdef HAVE_LIBNVIDIA_ML
+#ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
 		unsigned long long energy_start = 0;
 		nvmlReturn_t nvmlRet = -1;
 		if (profiling && task->profiling_info)
@@ -558,7 +561,7 @@ static void finish_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *wor
 	int profiling = starpu_profiling_status_get();
 
 
-#ifdef HAVE_LIBNVIDIA_ML
+#ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
 	if (profiling && j->task->profiling_info && j->task->profiling_info->energy_consumed)
 	{
 		unsigned long long energy_end;
@@ -880,27 +883,33 @@ int _starpu_cuda_driver_run_once(struct _starpu_worker_set *worker_set)
 			_starpu_set_local_worker_key(worker);
 			finish_job_on_cuda(_starpu_get_job_associated_to_task(task), worker);
 			/* See next task if any */
-			if (worker->ntasks && worker->current_tasks[worker->first_task] != worker->task_transferring)
+			if (worker->ntasks)
 			{
-				task = worker->current_tasks[worker->first_task];
-				j = _starpu_get_job_associated_to_task(task);
-				if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
+				if (worker->current_tasks[worker->first_task] != worker->task_transferring)
 				{
-					/* An asynchronous task, it was already
-					 * queued, it's now running, record its start time.  */
-					_starpu_driver_start_job(worker, j, &worker->perf_arch, 0, starpu_profiling_status_get());
+					task = worker->current_tasks[worker->first_task];
+					j = _starpu_get_job_associated_to_task(task);
+					if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
+					{
+						/* An asynchronous task, it was already
+						 * queued, it's now running, record its start time.  */
+						_starpu_driver_start_job(worker, j, &worker->perf_arch, 0, starpu_profiling_status_get());
+					}
+					else
+					{
+						/* A synchronous task, we have finished
+						 * flushing the pipeline, we can now at
+						 * last execute it.  */
+
+						_STARPU_TRACE_EVENT("sync_task");
+						execute_job_on_cuda(task, worker);
+						_STARPU_TRACE_EVENT("end_sync_task");
+						worker->pipeline_stuck = 0;
+					}
 				}
 				else
-				{
-					/* A synchronous task, we have finished
-					 * flushing the pipeline, we can now at
-					 * last execute it.  */
-
-					_STARPU_TRACE_EVENT("sync_task");
-					execute_job_on_cuda(task, worker);
-					_STARPU_TRACE_EVENT("end_sync_task");
-					worker->pipeline_stuck = 0;
-				}
+					/* Data for next task didn't have time to finish transferring :/ */
+					_STARPU_TRACE_WORKER_START_FETCH_INPUT(NULL, workerid);
 			}
 #ifdef STARPU_USE_FXT
 			int k;
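
The CUDA driver hunk tightens the energy-profiling guard from HAVE_LIBNVIDIA_ML to HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION, i.e. it now requires the specific NVML call that the start/finish paths bracket the kernel with. A hedged sketch of that measurement pattern; measure_kernel_energy() is an illustrative name, not a StarPU function, and nvmlDeviceGetTotalEnergyConsumption() reports cumulative millijoules since the driver was loaded:

#include <nvml.h>

double measure_kernel_energy(nvmlDevice_t dev, void (*run_kernel)(void))
{
	unsigned long long start = 0, end = 0;

	if (nvmlDeviceGetTotalEnergyConsumption(dev, &start) != NVML_SUCCESS)
		return 0.0;
	run_kernel();
	if (nvmlDeviceGetTotalEnergyConsumption(dev, &end) != NVML_SUCCESS)
		return 0.0;

	return (double)(end - start) / 1000.0;	/* millijoules -> joules */
}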

+ 1 - 0
src/drivers/disk/driver_disk.c

@@ -21,6 +21,7 @@
 #include <drivers/disk/driver_disk.h>
 #include <drivers/cpu/driver_cpu.h>
 #include <datawizard/coherency.h>
+#include <datawizard/memory_nodes.h>
 
 int _starpu_disk_copy_src_to_disk(void * src, unsigned src_node, void * dst, size_t dst_offset, unsigned dst_node, size_t size, void * async_channel)
 {

+ 0 - 0
src/drivers/driver_common/driver_common.c


Some files were not shown because too many files changed in this diff