5 år sedan · 8d264c88b3
--- a/ChangeLog
+++ b/ChangeLog
@@ -39,6 +39,8 @@ New features:
 
				     its OS index.
			
 
				   * New function starpu_get_hwloc_topology() to get a copy of the hwloc
			
 
				     topology used by StarPU.
			
 
				+  * Add a task prefetch level, to improve retaining data in accelerators so we
			
 
				+    can make prefetch more aggressive.
			
 
				 
			
 
				 Small changes:
			
 
				   * Use the S4U interface of Simgrid instead of xbt and MSG.
			
--- a/configure.ac
+++ b/configure.ac
@@ -1382,6 +1382,9 @@ if test x$enable_cuda = xyes; then
 
				 	    ]
			
 
				 	)
			
 
				 	if test x$have_valid_nvml = xyes ; then
			
 
				+		AC_CHECK_DECLS([nvmlDeviceGetTotalEnergyConsumption], [
			
 
				+			AC_CHECK_FUNCS([nvmlDeviceGetTotalEnergyConsumption])
			
 
				+			], [], [[#include <nvml.h>]])
			
 
				 		AC_DEFINE([HAVE_LIBNVIDIA_ML], [1], [Define to 1 if you have the nvidia-ml library])
			
 
				 		STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lnvidia-ml"
			
 
				 	fi
			
--- a/doc/doxygen/chapters/210_check_list_performance.doxy
+++ b/doc/doxygen/chapters/210_check_list_performance.doxy
@@ -91,6 +91,8 @@ operations to avoid this issue. For instance:
 
				 
			
 
				 \code{.c}
			
 
				 func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
			
 
				+cudaError_t status = cudaGetLastError();
			
 
				+if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 \endcode
			
 
				 
			
--- a/doc/doxygen/chapters/code/vector_scal_cuda.c
+++ b/doc/doxygen/chapters/code/vector_scal_cuda.c
@@ -35,6 +35,8 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 
				         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
			
 
				 
			
 
				         vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(n, val, *factor);
			
 
				+        cudaError_t status = cudaGetLastError();
			
 
				+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 
			
 
				         cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
--- a/doc/tutorial/vector_scal_cuda.cu
+++ b/doc/tutorial/vector_scal_cuda.cu
@@ -35,6 +35,8 @@ extern "C" void vector_scal_cuda(void *buffers[], void *_args)
 
				         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
			
 
				 
			
 
				         vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);
			
 
				+        cudaError_t status = cudaGetLastError();
			
 
				+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 
			
 
				         cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
--- a/examples/basic_examples/block_cuda.cu
+++ b/examples/basic_examples/block_cuda.cu
@@ -40,5 +40,7 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 
				         float *multiplier = (float *)_args;
			
 
				 
			
 
				         cuda_block<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *multiplier);
			
 
				+        cudaError_t status = cudaGetLastError();
			
 
				+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
--- a/examples/basic_examples/multiformat_conversion_codelets_cuda.cu
+++ b/examples/basic_examples/multiformat_conversion_codelets_cuda.cu
@@ -44,4 +44,6 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 
				 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
			
 
				 
			
 
				         cpu_to_cuda_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(src, dst, n);
			
 
				+        cudaError_t status = cudaGetLastError();
			
 
				+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
--- a/examples/basic_examples/multiformat_cuda.cu
+++ b/examples/basic_examples/multiformat_cuda.cu
@@ -39,6 +39,8 @@ extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
 
				 	unsigned threads_per_block = 64;
			
 
				 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
			
 
				         multiformat_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(soa, n);
			
 
				+        cudaError_t status = cudaGetLastError();
			
 
				+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 
			
 
				 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
--- a/examples/basic_examples/variable_kernels.cu
+++ b/examples/basic_examples/variable_kernels.cu
@@ -27,5 +27,7 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 
				 	float *val = (float *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				 
			
 
				 	cuda_variable<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
			
 
				+	cudaError_t status = cudaGetLastError();
			
 
				+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
--- a/examples/basic_examples/vector_scal_cuda.cu
+++ b/examples/basic_examples/vector_scal_cuda.cu
@@ -41,4 +41,6 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 
				 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
			
 
				 
			
 
				         vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(n, val, *factor);
			
 
				+        cudaError_t status = cudaGetLastError();
			
 
				+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
--- a/examples/filters/custom_mf/conversion.cu
+++ b/examples/filters/custom_mf/conversion.cu
@@ -45,4 +45,6 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 
				 	unsigned threads_per_block = 64;
			
 
				 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
			
 
				         custom_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(aop, n, x, y);
			
 
				+        cudaError_t status = cudaGetLastError();
			
 
				+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
--- a/examples/filters/custom_mf/cuda.cu
+++ b/examples/filters/custom_mf/cuda.cu
@@ -39,4 +39,6 @@ extern "C" void custom_scal_cuda_func(void *buffers[], void *_args)
 
				 	unsigned threads_per_block = 64;
			
 
				 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
			
 
				         scal_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(n, x, y);
			
 
				+        cudaError_t status = cudaGetLastError();
			
 
				+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
--- a/examples/filters/fblock_cuda.cu
+++ b/examples/filters/fblock_cuda.cu
@@ -43,4 +43,6 @@ extern "C" void cuda_func(void *buffers[], void *_args)
 
				 
			
 
				         /* TODO: use more blocks and threads in blocks */
			
 
				         fblock_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *factor);
			
 
				+        cudaError_t status = cudaGetLastError();
			
 
				+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
--- a/examples/filters/fmultiple_cuda.cu
+++ b/examples/filters/fmultiple_cuda.cu
@@ -44,6 +44,8 @@ extern "C" void fmultiple_check_scale_cuda(void *buffers[], void *cl_arg)
 
				 
			
 
				         /* TODO: use more vals and threads in vals */
			
 
				 	_fmultiple_check_scale_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val, nx, ny, ld, start, factor);
			
 
				+	cudaError_t status = cudaGetLastError();
			
 
				+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
 
				 
			
 
				 static __global__ void _fmultiple_check_cuda(int *val, int nx, int ny, unsigned ld, int start, int factor)
			
@@ -71,4 +73,6 @@ extern "C" void fmultiple_check_cuda(void *buffers[], void *cl_arg)
 
				 
			
 
				         /* TODO: use more vals and threads in vals */
			
 
				 	_fmultiple_check_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val, nx, ny, ld, start, factor);
			
 
				+	cudaError_t status = cudaGetLastError();
			
 
				+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
--- a/examples/incrementer/incrementer_kernels.cu
+++ b/examples/incrementer/incrementer_kernels.cu
@@ -32,4 +32,6 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 
				 	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				 
			
 
				 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
			
 
				+	cudaError_t status = cudaGetLastError();
			
 
				+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
--- a/examples/interface/complex_kernels.cu
+++ b/examples/interface/complex_kernels.cu
@@ -44,4 +44,6 @@ extern "C" void copy_complex_codelet_cuda(void *descr[], void *_args)
 
				 	unsigned nblocks = (nx + threads_per_block-1) / threads_per_block;
			
 
				 
			
 
				         complex_copy_cuda<<<nblocks, threads_per_block, 0, starpu_cuda_get_local_stream()>>>(o_real, o_imaginary, i_real, i_imaginary, nx);
			
 
				+        cudaError_t status = cudaGetLastError();
			
 
				+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
--- a/examples/pi/SobolQRNG/sobol_gpu.cu
+++ b/examples/pi/SobolQRNG/sobol_gpu.cu
@@ -165,4 +165,6 @@ extern "C" void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_direct
 
				 
			
 
				     // Execute GPU kernel
			
 
				     sobolGPU_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(n_vectors, n_dimensions, d_directions, d_output);
			
 
				+    cudaError_t status = cudaGetLastError();
			
 
				+    if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
--- a/examples/pi/pi_kernel.cu
+++ b/examples/pi/pi_kernel.cu
@@ -137,12 +137,16 @@ extern "C" void cuda_kernel(void *descr[], void *cl_arg)
 
				 	/* each entry of per_block_cnt contains the number of successful shots
			
 
				 	 * in the corresponding block. */
			
 
				 	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
			
 
				+	cures = cudaGetLastError();
			
 
				+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
			
 
				 
			
 
				 	/* compute the total number of successful shots by adding the elements
			
 
				 	 * of the per_block_cnt array */
			
 
				 	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, cnt);
			
 
				+	cures = cudaGetLastError();
			
 
				+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 	if (cures)
			
 
				 		STARPU_CUDA_REPORT_ERROR(cures);
			
--- a/examples/pi/pi_redux_kernel.cu
+++ b/examples/pi/pi_redux_kernel.cu
@@ -115,12 +115,16 @@ extern "C" void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned lo
 
				 	/* each entry of per_block_cnt contains the number of successful shots
			
 
				 	 * in the corresponding block. */
			
 
				 	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, y, n, per_block_cnt);
			
 
				+	cures = cudaGetLastError();
			
 
				+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 
			
 
				 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
			
 
				 
			
 
				 	/* compute the total number of successful shots by adding the elements
			
 
				 	 * of the per_block_cnt array */
			
 
				 	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, shot_cnt);
			
 
				+	cures = cudaGetLastError();
			
 
				+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
			
 
				 	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 	if (cures)
			
 
				 		STARPU_CUDA_REPORT_ERROR(cures);
			
--- a/examples/reductions/dot_product_kernels.cu
+++ b/examples/reductions/dot_product_kernels.cu
@@ -33,4 +33,6 @@ extern "C" void redux_cuda_func(void *descr[], void *_args)
 
				 	DOT_TYPE *dotb = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
			
 
				 
			
 
				 	cuda_redux<<<1,1, 0, starpu_cuda_get_local_stream()>>>(dota, dotb);
			
 
				+	cudaError_t status = cudaGetLastError();
			
 
				+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
--- a/examples/sched_ctx/axpy_partition_gpu.cu
+++ b/examples/sched_ctx/axpy_partition_gpu.cu
@@ -73,4 +73,6 @@ extern "C" void cuda_axpy(void *descr[], void *_args)
 
				 	__P_HOSTSETUP(saxpy_partitioned,dim3(dimensions,1,1),dimensions,0,SM_mapping_start,SM_allocation,stream);
			
 
				 
			
 
				   	saxpy_partitioned<<<width,dimensions,0,stream>>>(__P_HKARGS,n,a,x,y);
			
 
				+	cudaError_t status = cudaGetLastError();
			
 
				+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
--- a/examples/spmv/spmv_cuda.cu
+++ b/examples/spmv/spmv_cuda.cu
@@ -97,6 +97,8 @@ extern "C" void spmv_kernel_cuda(void *descr[], void *args)
 
				 
			
 
				 	spmv_kernel_3<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>
			
 
				 		(nnz, nrow, nzval, colind, rowptr, firstentry, vecin, nx_in, vecout, nx_out);
			
 
				+	cudaError_t status = cudaGetLastError();
			
 
				+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
 
				 
			
 
				 
			
--- a/examples/stencil/life_cuda.cu
+++ b/examples/stencil/life_cuda.cu
@@ -73,4 +73,6 @@ extern "C" void cuda_life_update_host(int bz, const TYPE *old, TYPE *newp, int n
 
				 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
			
 
				 #endif
			
 
				 	cuda_life_update <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, old, newp, nx, ny, nz, ldy, ldz, iter);
			
 
				+	cudaError_t status = cudaGetLastError();
			
 
				+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
--- a/examples/stencil/shadow.cu
+++ b/examples/stencil/shadow.cu
@@ -53,4 +53,6 @@ extern "C" void cuda_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int
 
				 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
			
 
				 #endif
			
 
				 	cuda_shadow <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, ptr, nx, ny, nz, ldy, ldz, i);
			
 
				+	cudaError_t status = cudaGetLastError();
			
 
				+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
--- a/julia/examples/mult/gpu_mult.cu
+++ b/julia/examples/mult/gpu_mult.cu
@@ -79,6 +79,8 @@ extern "C" void gpu_mult(void * descr[], void * args)
 
				 	gpuMultKernel
			
 
				 		<<< nblocks, THREADS_PER_BLOCK, 0, NULL /*starpu_cuda_get_local_stream()*/
			
 
				 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
			
 
				+	cudaError_t status = cudaGetLastError();
			
 
				+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 
			
 
				 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 
			
--- a/julia/examples/old_examples/gpu_mult.cu
+++ b/julia/examples/old_examples/gpu_mult.cu
@@ -78,6 +78,8 @@ extern "C" void gpu_mult(void * descr[], void * args)
 
				 	gpuMultKernel
			
 
				 		<<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
			
 
				 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
			
 
				+	cudaError_t status = cudaGetLastError();
			
 
				+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 
			
 
				 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 
			
--- a/julia/examples/old_examples/mandelbrot/gpu_mandelbrot.cu
+++ b/julia/examples/old_examples/mandelbrot/gpu_mandelbrot.cu
@@ -106,6 +106,8 @@ extern "C" void gpu_mandelbrot(void *descr[], void *args)
 
				   nblocks = (nxP * nyP + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
			
 
				 
			
 
				   gpuMandelbrotKernel <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream() >>> (nxP, nyP, ldP, d_subP, *params);
			
 
				+  cudaError_t status = cudaGetLastError();
			
 
				+  if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 
			
 
				   cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
--- a/julia/examples/old_examples/mandelbrot/gpu_mandelbrot_between.cu
+++ b/julia/examples/old_examples/mandelbrot/gpu_mandelbrot_between.cu
@@ -123,6 +123,8 @@ extern "C" void CUDA_mandelbrot(void** buffers_uwrYFDVe, void* cl_arg_uwrYFDVe)
 
				              ptr_qoUGBRtY, local_height, conv_limit, ptr_A5zD9sJZ, 
			
 
				              ld_A5zD9sJZ);
			
 
				     ;
			
 
				+    cudaError_t status = cudaGetLastError();
			
 
				+    if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				     cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 
			
--- a/julia/examples/old_examples/mult/gpu_mult.cu
+++ b/julia/examples/old_examples/mult/gpu_mult.cu
@@ -78,6 +78,8 @@ extern "C" void gpu_mult(void * descr[], void * args)
 
				 	gpuMultKernel
			
 
				 		<<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
			
 
				 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
			
 
				+	cudaError_t status = cudaGetLastError();
			
 
				+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 
			
 
				 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 
			
--- a/julia/examples/old_examples/nbody/gpu_nbody.cu
+++ b/julia/examples/old_examples/nbody/gpu_nbody.cu
@@ -94,6 +94,8 @@ extern "C" void gpu_nbody(void * descr[], void * args)
 
				   gpuNbodyKernel
			
 
				     <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
			
 
				     >>> (d_P,  d_subA, d_M, nxP, nxA, nxM, ldP, ldA, *params);
			
 
				+  cudaError_t status = cudaGetLastError();
			
 
				+  if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 
			
 
				   cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 
			
@@ -156,6 +158,8 @@ extern "C" void gpu_nbody2(void * descr[], void *args)
 
				   gpuNbody2Kernel
			
 
				     <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
			
 
				     >>> (d_subP, d_subV, d_subA, nxP, nxV, nxA, ldP, ldV, ldA, *params);
			
 
				+  cudaError_t status = cudaGetLastError();
			
 
				+  if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 
			
 
				   cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
--- a/julia/examples/old_examples/nbody/gpu_nbody_between.cu
+++ b/julia/examples/old_examples/nbody/gpu_nbody_between.cu
@@ -161,6 +161,8 @@ extern "C" void CUDA_nbody_updt(void** buffers_gj6UYWT4, void* cl_arg_gj6UYWT4)
 
				              ld_jJ5f8wMA, ptr_piPvdbTs, ld_piPvdbTs, ptr_JBaPgPiT, 
			
 
				              ptr_0STm2S4k, ld_0STm2S4k);
			
 
				     ;
			
 
				+    cudaError_t status = cudaGetLastError();
			
 
				+    if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				     cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
 
				 
			
--- a/julia/src/compiler/expressions.jl
+++ b/julia/src/compiler/expressions.jl
@@ -335,6 +335,10 @@ function print(io :: IO, expr :: StarpuExprCudaCall ; indent = 0,restrict=false)
 
				 
			
 
				     print(io, ");")
			
 
				     print_newline(io, indent)
			
 
				+    print(io, "cudaError_t status = cudaGetLastError();")
			
 
				+    print_newline(io, indent)
			
 
				+    print(io, "if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);")
			
 
				+    print_newline(io, indent)
			
 
				 
			
 
				 end
			
 
				 
			
--- a/mpi/tests/ring_kernel.cu
+++ b/mpi/tests/ring_kernel.cu
@@ -27,5 +27,7 @@ extern "C" void increment_cuda(void *descr[], void *_args)
 
				 	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				 
			
 
				 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(tokenptr);
			
 
				+	cudaError_t status = cudaGetLastError();
			
 
				+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				 }
			
--- a/src/core/jobs.c
+++ b/src/core/jobs.c
@@ -485,6 +485,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 
				 		 * also the callback were executed. */
			
 
				 		j->terminated = 2;
			
 
				 	}
			
 
				+	task->prefetched = 0;
			
 
				 	STARPU_PTHREAD_COND_BROADCAST(&j->sync_cond);
			
 
				 	STARPU_AYU_REMOVETASK(j->job_id);
			
 
				 	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);
			
--- a/src/core/sched_policy.c
+++ b/src/core/sched_policy.c
@@ -570,9 +570,6 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 
				 	int ret = 0;
			
 
				 	if (STARPU_UNLIKELY(task->execute_on_a_specific_worker))
			
 
				 	{
			
 
				-		if (starpu_get_prefetch_flag())
			
 
				-			starpu_prefetch_task_input_for(task, task->workerid);
			
 
				-
			
 
				 		ret = _starpu_push_task_on_specific_worker(task, task->workerid);
			
 
				 	}
			
 
				 	else
			
@@ -582,7 +579,8 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 
				 		/* When a task can only be executed on a given arch and we have
			
 
				 		 * only one memory node for that arch, we can systematically
			
 
				 		 * prefetch before the scheduling decision. */
			
 
				-		if (starpu_get_prefetch_flag() && starpu_memory_nodes_get_count() > 1)
			
 
				+		/* XXX: no, this is definitely not a good idea if the scheduler reorders tasks... */
			
 
				+		if (0 && starpu_get_prefetch_flag() && starpu_memory_nodes_get_count() > 1)
			
 
				 		{
			
 
				 			if (task->where == STARPU_CPU && config->cpus_nodeid >= 0)
			
 
				 				starpu_prefetch_task_input_on_node(task, config->cpus_nodeid);
			
--- a/src/datawizard/coherency.c
+++ b/src/datawizard/coherency.c
@@ -404,7 +404,7 @@ int _starpu_determine_request_path(starpu_data_handle_t handle,
 
				 /* handle->lock should be taken. r is returned locked. The node parameter
			
 
				  * indicate either the source of the request, or the destination for a
			
 
				  * write-only request. */
			
 
				-static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, unsigned is_prefetch)
			
 
				+static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, enum _starpu_is_prefetch is_prefetch)
			
 
				 {
			
 
				 	struct _starpu_data_request *r;
			
 
				 
			
@@ -467,7 +467,7 @@ static struct _starpu_data_request *_starpu_search_existing_data_request(struct
 
				 
			
 
				 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
			
 
				 								  struct _starpu_data_replicate *dst_replicate,
			
 
				-								  enum starpu_data_access_mode mode, unsigned is_prefetch,
			
 
				+								  enum starpu_data_access_mode mode, enum _starpu_is_prefetch is_prefetch,
			
 
				 								  unsigned async,
			
 
				 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
			
 
				 {
			
@@ -522,7 +522,13 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 
				 #endif
			
 
				 
			
 
				 			if (dst_replicate->mc)
			
 
				+			{
			
 
				+				if (is_prefetch == STARPU_TASK_PREFETCH)
			
 
				+					/* Make sure it stays there */
			
 
				+					dst_replicate->mc->nb_tasks_prefetch++;
			
 
				+
			
 
				 				_starpu_memchunk_recently_used(dst_replicate->mc, requesting_node);
			
 
				+			}
			
 
				 		}
			
 
				 
			
 
				 		_starpu_spin_unlock(&handle->header_lock);
			
@@ -567,6 +573,9 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 
				 			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch) == 0)
			
 
				 			{
			
 
				 				_starpu_update_data_state(handle, dst_replicate, mode);
			
 
				+				if (is_prefetch == STARPU_TASK_PREFETCH)
			
 
				+					/* Make sure it stays there */
			
 
				+					dst_replicate->mc->nb_tasks_prefetch++;
			
 
				 
			
 
				 				_starpu_spin_unlock(&handle->header_lock);
			
 
				 
			
@@ -645,9 +654,17 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 
				 				STARPU_ASSERT(r->next_req_count <= STARPU_MAXNODES);
			
 
				 			}
			
 
				 		}
			
 
				-		else if (!write_invalidation)
			
 
				-			/* The last request will perform the callback after termination */
			
 
				-			_starpu_data_request_append_callback(r, callback_func, callback_arg);
			
 
				+		else
			
 
				+		{
			
 
				+			if (is_prefetch == STARPU_TASK_PREFETCH)
			
 
				+				/* Make last request add the prefetch count on the mc to keep the data
			
 
				+				 * there until the task gets to execute.  */
			
 
				+				r->nb_tasks_prefetch++;
			
 
				+
			
 
				+			if (!write_invalidation)
			
 
				+				/* The last request will perform the callback after termination */
			
 
				+				_starpu_data_request_append_callback(r, callback_func, callback_arg);
			
 
				+		}
			
 
				 
			
 
				 
			
 
				 		if (reused_requests[hop])
			
@@ -712,7 +729,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 
				 }
			
 
				 
			
 
				 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *dst_replicate,
			
 
				-			       enum starpu_data_access_mode mode, unsigned detached, unsigned is_prefetch, unsigned async,
			
 
				+			       enum starpu_data_access_mode mode, unsigned detached, enum _starpu_is_prefetch is_prefetch, unsigned async,
			
 
				 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
			
 
				 {
			
 
				         _STARPU_LOG_IN();
			
@@ -744,6 +761,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 
				 		if (src_node_mask == 0)
			
 
				 		{
			
 
				 			/* no valid copy, nothing to prefetch */
			
 
				+			_STARPU_DISP("Warning: no valid copy to prefetch?! that's not supposed to happen, please report\n");
			
 
				 			_starpu_spin_unlock(&handle->header_lock);
			
 
				 			return 0;
			
 
				 		}
			
@@ -782,17 +800,22 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 
				 
			
 
				 static int idle_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
			
 
				 {
			
 
				-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, 2, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
			
 
				+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_IDLEFETCH, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
			
 
				 }
			
 
				 
			
 
				-static int prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
			
 
				+static int task_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
			
 
				 {
			
 
				-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, 1, 1, NULL, NULL, prio, "prefetch_data_on_node");
			
 
				+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_TASK_PREFETCH, 1, NULL, NULL, prio, "task_prefetch_data_on_node");
			
 
				+}
			
 
				+
			
 
				+static int STARPU_ATTRIBUTE_UNUSED prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
			
 
				+{
			
 
				+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_PREFETCH, 1, NULL, NULL, prio, "prefetch_data_on_node");
			
 
				 }
			
 
				 
			
 
				 static int fetch_data(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
			
 
				 {
			
 
				-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, 0, 0, NULL, NULL, prio, "fetch_data");
			
 
				+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, STARPU_FETCH, 0, NULL, NULL, prio, "fetch_data");
			
 
				 }
			
 
				 
			
 
				 uint32_t _starpu_get_data_refcnt(starpu_data_handle_t handle, unsigned node)
			
@@ -911,10 +934,11 @@ int starpu_prefetch_task_input_on_node_prio(struct starpu_task *task, unsigned t
 
				 		int node = _starpu_task_data_get_node_on_node(task, index, target_node);
			
 
				 
			
 
				 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
			
 
				-		prefetch_data_on_node(handle, node, replicate, mode, prio);
			
 
				+		task_prefetch_data_on_node(handle, node, replicate, mode, prio);
			
 
				 
			
 
				 		_starpu_set_data_requested_flag_if_needed(handle, replicate);
			
 
				 	}
			
 
				+	task->prefetched = 1;
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
@@ -988,10 +1012,11 @@ int starpu_prefetch_task_input_for_prio(struct starpu_task *task, unsigned worke
 
				 		int node = _starpu_task_data_get_node_on_worker(task, index, worker);
			
 
				 
			
 
				 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
			
 
				-		prefetch_data_on_node(handle, node, replicate, mode, prio);
			
 
				+		task_prefetch_data_on_node(handle, node, replicate, mode, prio);
			
 
				 
			
 
				 		_starpu_set_data_requested_flag_if_needed(handle, replicate);
			
 
				 	}
			
 
				+	task->prefetched = 1;
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
@@ -1133,7 +1158,7 @@ int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, in
 
				 
			
 
				 		if (async)
			
 
				 		{
			
 
				-			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, 0, 1,
			
 
				+			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, STARPU_FETCH, 1,
			
 
				 					_starpu_fetch_task_input_cb, worker, 0, "_starpu_fetch_task_input");
			
 
				 #ifdef STARPU_SIMGRID
			
 
				 			if (_starpu_simgrid_fetching_input_cost())
			
@@ -1223,7 +1248,19 @@ void _starpu_fetch_task_input_tail(struct starpu_task *task, struct _starpu_job
 
				 		local_replicate = get_replicate(handle, mode, workerid, node);
			
 
				 		_starpu_spin_lock(&handle->header_lock);
			
 
				 		if (local_replicate->mc)
			
 
				+		{
			
 
				 			local_replicate->mc->diduse = 1;
			
 
				+			if (task->prefetched &&
			
 
				+				!(mode & (STARPU_SCRATCH|STARPU_REDUX)) &&
			
 
				+				(mode & STARPU_R))
			
 
				+			{
			
 
				+				/* Allocations or transfer prefetchs should have been done by now and marked
			
 
				+				 * this mc as needed for us.
			
 
				+				 * Now that we added a reference for the task, we can relieve that.  */
			
 
				+				STARPU_ASSERT(local_replicate->mc->nb_tasks_prefetch > 0);
			
 
				+				local_replicate->mc->nb_tasks_prefetch--;
			
 
				+			}
			
 
				+		}
			
 
				 		_starpu_spin_unlock(&handle->header_lock);
			
 
				 
			
 
				 		_STARPU_TASK_SET_INTERFACE(task , local_replicate->data_interface, descrs[index].index);
			
@@ -1372,7 +1409,7 @@ void _starpu_fetch_nowhere_task_input(struct _starpu_job *j)
 
				 
			
 
				 		local_replicate = get_replicate(handle, mode, -1, node);
			
 
				 
			
 
				-		_starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, 0, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
			
 
				+		_starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, STARPU_FETCH, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
			
 
				 	}
			
 
				 
			
 
				 	if (profiling && task->profiling_info)
			
--- a/src/datawizard/coherency.h
+++ b/src/datawizard/coherency.h
@@ -298,7 +298,7 @@ struct _starpu_data_state
 
				  * async means that _starpu_fetch_data_on_node will wait for completion of the request
			
 
				  */
			
 
				 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate,
			
 
				-			       enum starpu_data_access_mode mode, unsigned detached, unsigned is_prefetch, unsigned async,
			
 
				+			       enum starpu_data_access_mode mode, unsigned detached, enum _starpu_is_prefetch is_prefetch, unsigned async,
			
 
				 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
			
 
				 /** This releases a reference on the handle */
			
 
				 void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
			
@@ -341,7 +341,7 @@ int _starpu_determine_request_path(starpu_data_handle_t handle,
 
				  */
			
 
				 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
			
 
				 								  struct _starpu_data_replicate *dst_replicate,
			
 
				-								  enum starpu_data_access_mode mode, unsigned is_prefetch,
			
 
				+								  enum starpu_data_access_mode mode, enum _starpu_is_prefetch is_prefetch,
			
 
				 								  unsigned async,
			
 
				 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
			
 
				 
			
--- a/src/datawizard/copy_driver.c
+++ b/src/datawizard/copy_driver.c
@@ -201,7 +201,7 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 
				 									unsigned donotread,
			
 
				 									struct _starpu_data_request *req,
			
 
				 									unsigned may_alloc,
			
 
				-									unsigned prefetch STARPU_ATTRIBUTE_UNUSED)
			
 
				+									enum _starpu_is_prefetch prefetch STARPU_ATTRIBUTE_UNUSED)
			
 
				 {
			
 
				 	if (!donotread)
			
 
				 	{
			
@@ -219,7 +219,7 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 
				 			/* We're not supposed to allocate there at the moment */
			
 
				 			return -ENOMEM;
			
 
				 
			
 
				-		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, req ? req->prefetch : 0);
			
 
				+		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, req ? req->prefetch : STARPU_FETCH);
			
 
				 		if (ret_alloc)
			
 
				 			return -ENOMEM;
			
 
				 	}
			
--- a/src/datawizard/copy_driver.h
+++ b/src/datawizard/copy_driver.h
@@ -47,6 +47,15 @@ extern "C"
 
				 struct _starpu_data_request;
			
 
				 struct _starpu_data_replicate;
			
 
				 
			
 
				+enum _starpu_is_prefetch
			
 
				+{
			
 
				+	STARPU_FETCH = 0,		/* A task really needs it now! */
			
 
				+	STARPU_TASK_PREFETCH = 1,	/* A task will need it soon */
			
 
				+	STARPU_PREFETCH = 2,		/* It is a good idea to have it asap */
			
 
				+	STARPU_IDLEFETCH = 3,		/* Get this here when you have time to */
			
 
				+	STARPU_NFETCH
			
 
				+};
			
 
				+
			
 
				 #ifdef STARPU_USE_MIC
			
 
				 /** MIC needs memory_node to know which MIC is concerned.
			
 
				  * mark is used to wait asynchronous request.
			
@@ -132,7 +141,7 @@ int _starpu_driver_copy_data_1_to_1(starpu_data_handle_t handle,
 
				 				    unsigned donotread,
			
 
				 				    struct _starpu_data_request *req,
			
 
				 				    unsigned may_alloc,
			
 
				-				    unsigned prefetch);
			
 
				+				    enum _starpu_is_prefetch prefetch);
			
 
				 
			
 
				 unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel);
			
 
				 void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel);
			
--- a/src/datawizard/data_request.c
+++ b/src/datawizard/data_request.c
@@ -29,7 +29,7 @@
 
				 #warning split into separate out/in queues for each node, so that MAX_PENDING_REQUESTS_PER_NODE is separate for them, since the links are bidirectionnal
			
 
				 #endif
			
 
				 static struct _starpu_data_request_prio_list data_requests[STARPU_MAXNODES];
			
 
				-static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES];
			
 
				+static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES]; /* Contains both task_prefetch and prefetch */
			
 
				 static struct _starpu_data_request_prio_list idle_requests[STARPU_MAXNODES];
			
 
				 static starpu_pthread_mutex_t data_requests_list_mutex[STARPU_MAXNODES];
			
 
				 
			
@@ -124,7 +124,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 
				 							 int handling_node,
			
 
				 							 enum starpu_data_access_mode mode,
			
 
				 							 unsigned ndeps,
			
 
				-							 unsigned is_prefetch,
			
 
				+							 enum _starpu_is_prefetch is_prefetch,
			
 
				 							 int prio,
			
 
				 							 unsigned is_write_invalidation,
			
 
				 							 const char *origin)
			
@@ -156,6 +156,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 
				 	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
			
 
				 	r->completed = 0;
			
 
				 	r->prefetch = is_prefetch;
			
 
				+	r->nb_tasks_prefetch = 0;
			
 
				 	r->prio = prio;
			
 
				 	r->retval = -1;
			
 
				 	r->ndeps = ndeps;
			
@@ -310,9 +311,9 @@ void _starpu_post_data_request(struct _starpu_data_request *r)
 
				 
			
 
				 	/* insert the request in the proper list */
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node]);
			
 
				-	if (r->prefetch == 2)
			
 
				+	if (r->prefetch >= STARPU_IDLEFETCH)
			
 
				 		_starpu_data_request_prio_list_push_back(&idle_requests[handling_node], r);
			
 
				-	else if (r->prefetch)
			
 
				+	else if (r->prefetch > STARPU_FETCH)
			
 
				 		_starpu_data_request_prio_list_push_back(&prefetch_requests[handling_node], r);
			
 
				 	else
			
 
				 		_starpu_data_request_prio_list_push_back(&data_requests[handling_node], r);
			
@@ -413,6 +414,10 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 
				 	/* Remove a reference on the destination replicate for the request */
			
 
				 	if (dst_replicate)
			
 
				 	{
			
 
				+		if (dst_replicate->mc)
			
 
				+			/* Make sure it stays there for the task.  */
			
 
				+			dst_replicate->mc->nb_tasks_prefetch += r->nb_tasks_prefetch;
			
 
				+
			
 
				 		STARPU_ASSERT(dst_replicate->refcnt > 0);
			
 
				 		dst_replicate->refcnt--;
			
 
				 	}
			
@@ -463,7 +468,7 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 
				 }
			
 
				 
			
 
				 /* TODO : accounting to see how much time was spent working for other people ... */
			
 
				-static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned may_alloc, int prefetch)
			
 
				+static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned may_alloc, enum _starpu_is_prefetch prefetch)
			
 
				 {
			
 
				 	starpu_data_handle_t handle = r->handle;
			
 
				 
			
@@ -538,7 +543,7 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list *reqlist, unsigned src_node, unsigned may_alloc, unsigned n, unsigned *pushed, unsigned prefetch)
			
 
				+static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list *reqlist, unsigned src_node, unsigned may_alloc, unsigned n, unsigned *pushed, enum _starpu_is_prefetch prefetch)
			
 
				 {
			
 
				 	struct _starpu_data_request *r;
			
 
				 	struct _starpu_data_request_prio_list new_data_requests[prefetch + 1]; /* Indexed by prefetch level */
			
@@ -609,7 +614,7 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 
				 			ret = res;
			
 
				 			/* Prefetch requests might have gotten promoted while in tmp list */
			
 
				 			_starpu_data_request_prio_list_push_back(&new_data_requests[r->prefetch], r);
			
 
				-			if (prefetch)
			
 
				+			if (prefetch > STARPU_FETCH)
			
 
				 				/* Prefetching more there would make the situation even worse */
			
 
				 				break;
			
 
				 		}
			
@@ -639,20 +644,25 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 
				 	if (i <= prefetch)
			
 
				 	{
			
 
				 		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
			
 
				-		if (!(_starpu_data_request_prio_list_empty(&new_data_requests[0])))
			
 
				+		if (!(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_FETCH])))
			
 
				+		{
			
 
				+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_FETCH], &data_requests[src_node]);
			
 
				+			data_requests[src_node] = new_data_requests[STARPU_FETCH];
			
 
				+		}
			
 
				+		if (prefetch >= STARPU_TASK_PREFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_TASK_PREFETCH])))
			
 
				 		{
			
 
				-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[0], &data_requests[src_node]);
			
 
				-			data_requests[src_node] = new_data_requests[0];
			
 
				+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_TASK_PREFETCH], &prefetch_requests[src_node]);
			
 
				+			prefetch_requests[src_node] = new_data_requests[STARPU_TASK_PREFETCH];
			
 
				 		}
			
 
				-		if (prefetch >= 1 && !(_starpu_data_request_prio_list_empty(&new_data_requests[1])))
			
 
				+		if (prefetch >= STARPU_PREFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_PREFETCH])))
			
 
				 		{
			
 
				-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[1], &prefetch_requests[src_node]);
			
 
				-			prefetch_requests[src_node] = new_data_requests[1];
			
 
				+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_PREFETCH], &prefetch_requests[src_node]);
			
 
				+			prefetch_requests[src_node] = new_data_requests[STARPU_PREFETCH];
			
 
				 		}
			
 
				-		if (prefetch >= 2 && !(_starpu_data_request_prio_list_empty(&new_data_requests[2])))
			
 
				+		if (prefetch >= STARPU_IDLEFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_IDLEFETCH])))
			
 
				 		{
			
 
				-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[2], &idle_requests[src_node]);
			
 
				-			idle_requests[src_node] = new_data_requests[2];
			
 
				+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_IDLEFETCH], &idle_requests[src_node]);
			
 
				+			idle_requests[src_node] = new_data_requests[STARPU_IDLEFETCH];
			
 
				 		}
			
 
				 		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
			
 
				 
			
@@ -678,17 +688,17 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 
				 
			
 
				 int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
			
 
				 {
			
 
				-	return __starpu_handle_node_data_requests(data_requests, src_node, may_alloc, MAX_PENDING_REQUESTS_PER_NODE, pushed, 0);
			
 
				+	return __starpu_handle_node_data_requests(data_requests, src_node, may_alloc, MAX_PENDING_REQUESTS_PER_NODE, pushed, STARPU_FETCH);
			
 
				 }
			
 
				 
			
 
				 int _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
			
 
				 {
			
 
				-	return __starpu_handle_node_data_requests(prefetch_requests, src_node, may_alloc, MAX_PENDING_PREFETCH_REQUESTS_PER_NODE, pushed, 1);
			
 
				+	return __starpu_handle_node_data_requests(prefetch_requests, src_node, may_alloc, MAX_PENDING_PREFETCH_REQUESTS_PER_NODE, pushed, STARPU_PREFETCH);
			
 
				 }
			
 
				 
			
 
				 int _starpu_handle_node_idle_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
			
 
				 {
			
 
				-	return __starpu_handle_node_data_requests(idle_requests, src_node, may_alloc, MAX_PENDING_IDLE_REQUESTS_PER_NODE, pushed, 2);
			
 
				+	return __starpu_handle_node_data_requests(idle_requests, src_node, may_alloc, MAX_PENDING_IDLE_REQUESTS_PER_NODE, pushed, STARPU_IDLEFETCH);
			
 
				 }
			
 
				 
			
 
				 static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
			
@@ -839,11 +849,15 @@ int _starpu_check_that_no_data_request_is_pending(unsigned node)
 
				 }
			
 
				 
			
 
				 
			
 
				-void _starpu_update_prefetch_status(struct _starpu_data_request *r, unsigned prefetch)
			
 
				+void _starpu_update_prefetch_status(struct _starpu_data_request *r, enum _starpu_is_prefetch prefetch)
			
 
				 {
			
 
				 	STARPU_ASSERT(r->prefetch > prefetch);
			
 
				 	r->prefetch=prefetch;
			
 
				 
			
 
				+	if (prefetch >= STARPU_IDLEFETCH)
			
 
				+		/* No possible actual change */
			
 
				+		return;
			
 
				+
			
 
				 	/* We have to promote chained_request too! */
			
 
				 	unsigned chained_req;
			
 
				 	for (chained_req = 0; chained_req < r->next_req_count; chained_req++)
			
@@ -855,19 +869,20 @@ void _starpu_update_prefetch_status(struct _starpu_data_request *r, unsigned pre
 
				 
			
 
				 	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[r->handling_node]);
			
 
				 
			
 
				+	int found = 1;
			
 
				+
			
 
				 	/* The request can be in a different list (handling request or the temp list)
			
 
				-	 * we have to check that it is really in the prefetch list. */
			
 
				+	 * we have to check that it is really in the prefetch or idle list. */
			
 
				 	if (_starpu_data_request_prio_list_ismember(&prefetch_requests[r->handling_node], r))
			
 
				-	{
			
 
				-		_starpu_data_request_prio_list_erase(&prefetch_requests[r->handling_node],r);
			
 
				-		_starpu_data_request_prio_list_push_back(&data_requests[r->handling_node],r);
			
 
				-	}
			
 
				-	/* The request can be in a different list (handling request or the temp list)
			
 
				-	 * we have to check that it is really in the idle list. */
			
 
				+		_starpu_data_request_prio_list_erase(&prefetch_requests[r->handling_node], r);
			
 
				 	else if (_starpu_data_request_prio_list_ismember(&idle_requests[r->handling_node], r))
			
 
				+		_starpu_data_request_prio_list_erase(&idle_requests[r->handling_node], r);
			
 
				+	else
			
 
				+		found = 0;
			
 
				+
			
 
				+	if (found)
			
 
				 	{
			
 
				-		_starpu_data_request_prio_list_erase(&idle_requests[r->handling_node],r);
			
 
				-		if (prefetch == 1)
			
 
				+		if (prefetch > STARPU_FETCH)
			
 
				 			_starpu_data_request_prio_list_push_back(&prefetch_requests[r->handling_node],r);
			
 
				 		else
			
 
				 			_starpu_data_request_prio_list_push_back(&data_requests[r->handling_node],r);
			
--- a/src/datawizard/data_request.h
+++ b/src/datawizard/data_request.h
@@ -79,12 +79,11 @@ LIST_TYPE(_starpu_data_request,
 
				 	/** Whether the transfer is completed. */
			
 
				 	unsigned completed;
			
 
				 
			
 
				-	/** Whether this is just a prefetch request:
			
 
				-	 * 0 for fetch,
			
 
				-	 * 1 for prefetch (dependencies have just been released)
			
 
				-	 * 2 for idle (a good idea to do it some time, but no hurry at all)
			
 
				-	 */
			
 
				-	unsigned prefetch;
			
 
				+	/** Whether this is just a prefetch request */
			
 
				+	enum _starpu_is_prefetch prefetch;
			
 
				+
			
 
				+	/** Number of tasks which used this as a prefetch */
			
 
				+	unsigned nb_tasks_prefetch;
			
 
				 
			
 
				 	/** Priority of the request. Default is 0 */
			
 
				 	int prio;
			
@@ -151,7 +150,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 
				 							 int handling_node,
			
 
				 							 enum starpu_data_access_mode mode,
			
 
				 							 unsigned ndeps,
			
 
				-							 unsigned is_prefetch,
			
 
				+							 enum _starpu_is_prefetch is_prefetch,
			
 
				 							 int prio,
			
 
				 							 unsigned is_write_invalidation,
			
 
				 							 const char *origin) STARPU_ATTRIBUTE_MALLOC;
			
@@ -162,5 +161,5 @@ void _starpu_data_request_append_callback(struct _starpu_data_request *r,
 
				 					  void (*callback_func)(void *),
			
 
				 					  void *callback_arg);
			
 
				 
			
 
				-void _starpu_update_prefetch_status(struct _starpu_data_request *r, unsigned prefetch);
			
 
				+void _starpu_update_prefetch_status(struct _starpu_data_request *r, enum _starpu_is_prefetch prefetch);
			
 
				 #endif // __DATA_REQUEST_H__
			
--- a/src/datawizard/filters.c
+++ b/src/datawizard/filters.c
@@ -193,7 +193,7 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 
				 		int home_node = initial_handle->home_node;
			
 
				 		if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
			
 
				 			home_node = STARPU_MAIN_RAM;
			
 
				-		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], 0);
			
 
				+		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], STARPU_FETCH);
			
 
				 #ifdef STARPU_DEVEL
			
 
				 #warning we should reclaim memory if allocation failed
			
 
				 #endif
			
--- a/src/datawizard/interfaces/data_interface.c
+++ b/src/datawizard/interfaces/data_interface.c
@@ -693,7 +693,7 @@ void _starpu_check_if_valid_and_fetch_data_on_node(starpu_data_handle_t handle,
 
				 	}
			
 
				 	if (valid)
			
 
				 	{
			
 
				-		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, 0, 0, NULL, NULL, 0, origin);
			
 
				+		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, STARPU_FETCH, 0, NULL, NULL, 0, origin);
			
 
				 		STARPU_ASSERT(!ret);
			
 
				 		_starpu_release_data_on_node(handle, handle->home_node, replicate);
			
 
				 	}
			
--- a/src/datawizard/memalloc.c
+++ b/src/datawizard/memalloc.c
@@ -322,7 +322,7 @@ static int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT transfer_subtree_to_node(starpu_d
 
				 		{
			
 
				 			/* This is the only copy, push it to destination */
			
 
				 			struct _starpu_data_request *r;
			
 
				-			r = _starpu_create_request_to_fetch_data(handle, dst_replicate, STARPU_R, 0, 0, NULL, NULL, 0, "transfer_subtree_to_node");
			
 
				+			r = _starpu_create_request_to_fetch_data(handle, dst_replicate, STARPU_R, STARPU_FETCH, 0, NULL, NULL, 0, "transfer_subtree_to_node");
			
 
				 			/* There is no way we don't need a request, since
			
 
				 			 * source is OWNER, destination can't be having it */
			
 
				 			STARPU_ASSERT(r);
			
@@ -546,7 +546,7 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 
				 /* This function is called for memory chunks that are possibly in used (ie. not
			
 
				  * in the cache). They should therefore still be associated to a handle. */
			
 
				 /* mc_lock is held and may be temporarily released! */
			
 
				-static size_t try_to_throw_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node, struct _starpu_data_replicate *replicate, unsigned is_already_in_mc_list)
			
 
				+static size_t try_to_throw_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node, struct _starpu_data_replicate *replicate, unsigned is_already_in_mc_list, enum _starpu_is_prefetch is_prefetch)
			
 
				 {
			
 
				 	size_t freed = 0;
			
 
				 
			
@@ -571,6 +571,10 @@ static size_t try_to_throw_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node
 
				 		/* Hasn't been used yet, avoid evicting it */
			
 
				 		return 0;
			
 
				 
			
 
				+	if (mc->nb_tasks_prefetch && is_prefetch >= STARPU_TASK_PREFETCH)
			
 
				+		/* We have not finished executing the tasks this was prefetched for */
			
 
				+		return 0;
			
 
				+
			
 
				 	/* REDUX memchunk */
			
 
				 	if (mc->relaxed_coherency == 2)
			
 
				 	{
			
@@ -782,7 +786,7 @@ static int try_to_find_reusable_mc(unsigned node, starpu_data_handle_t data, str
 
				 
			
 
				 /* this function looks for a memory chunk that matches a given footprint in the
			
 
				  * list of mem chunk that are not important */
			
 
				-static int try_to_reuse_not_important_mc(unsigned node, starpu_data_handle_t data, struct _starpu_data_replicate *replicate, uint32_t footprint)
			
 
				+static int try_to_reuse_not_important_mc(unsigned node, starpu_data_handle_t data, struct _starpu_data_replicate *replicate, uint32_t footprint, enum _starpu_is_prefetch is_prefetch)
			
 
				 {
			
 
				 	struct _starpu_mem_chunk *mc, *orig_next_mc, *next_mc;
			
 
				 	int success = 0;
			
@@ -816,7 +820,7 @@ restart:
 
				 		}
			
 
				 
			
 
				 		/* Note: this may unlock mc_list! */
			
 
				-		success = try_to_throw_mem_chunk(mc, node, replicate, 1);
			
 
				+		success = try_to_throw_mem_chunk(mc, node, replicate, 1, is_prefetch);
			
 
				 
			
 
				 		if (orig_next_mc)
			
 
				 		{
			
@@ -841,11 +845,14 @@ restart:
 
				  * Try to find a buffer currently in use on the memory node which has the given
			
 
				  * footprint.
			
 
				  */
			
 
				-static int try_to_reuse_potentially_in_use_mc(unsigned node, starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, uint32_t footprint, int is_prefetch)
			
 
				+static int try_to_reuse_potentially_in_use_mc(unsigned node, starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, uint32_t footprint, enum _starpu_is_prefetch is_prefetch)
			
 
				 {
			
 
				 	struct _starpu_mem_chunk *mc, *next_mc, *orig_next_mc;
			
 
				 	int success = 0;
			
 
				 
			
 
				+	if (is_prefetch >= STARPU_IDLEFETCH)
			
 
				+		/* Do not evict a MC just for an idle fetch */
			
 
				+		return 0;
			
 
				 	/*
			
 
				 	 * We have to unlock mc_lock before locking header_lock, so we have
			
 
				 	 * to be careful with the list.  We try to do just one pass, by
			
@@ -868,14 +875,11 @@ restart:
 
				 		if (mc->remove_notify)
			
 
				 			/* Somebody already working here, skip */
			
 
				 			continue;
			
 
				-		if (is_prefetch > 1)
			
 
				-			/* Do not evict a MC just for an idle fetch */
			
 
				-			continue;
			
 
				-		if (is_prefetch == 1 && !mc->wontuse)
			
 
				+		if (!mc->wontuse && is_prefetch >= STARPU_PREFETCH)
			
 
				 			/* Do not evict something that we might reuse, just for a prefetch */
			
 
				-			/* FIXME: but perhaps we won't have any task using it in
			
 
				-                         * the close future, we should perhaps rather check
			
 
				-                         * mc->replicate->refcnt? */
			
 
				+			continue;
			
 
				+		if (mc->nb_tasks_prefetch && is_prefetch >= STARPU_TASK_PREFETCH)
			
 
				+			/* Do not evict something that we will reuse, just for a task prefetch */
			
 
				 			continue;
			
 
				 		if (mc->footprint != footprint || _starpu_data_interface_compare(handle->per_node[node].data_interface, handle->ops, mc->data->per_node[node].data_interface, mc->ops) != 1)
			
 
				 			/* Not the right type of interface, skip */
			
@@ -889,7 +893,7 @@ restart:
 
				 		}
			
 
				 
			
 
				 		/* Note: this may unlock mc_list! */
			
 
				-		success = try_to_throw_mem_chunk(mc, node, replicate, 1);
			
 
				+		success = try_to_throw_mem_chunk(mc, node, replicate, 1, is_prefetch);
			
 
				 
			
 
				 		if (orig_next_mc)
			
 
				 		{
			
@@ -999,7 +1003,7 @@ restart2:
 
				 				next_mc->remove_notify = &next_mc;
			
 
				 			}
			
 
				 			/* Note: this may unlock mc_list! */
			
 
				-			freed += try_to_throw_mem_chunk(mc, node, NULL, 0);
			
 
				+			freed += try_to_throw_mem_chunk(mc, node, NULL, 0, STARPU_FETCH);
			
 
				 
			
 
				 			if (orig_next_mc)
			
 
				 			{
			
@@ -1218,7 +1222,7 @@ void starpu_memchunk_tidy(unsigned node)
 
				 			}
			
 
				 
			
 
				 			_starpu_spin_unlock(&mc_lock[node]);
			
 
				-			if (!_starpu_create_request_to_fetch_data(handle, &handle->per_node[target_node], STARPU_R, 2, 1, NULL, NULL, 0, "starpu_memchunk_tidy"))
			
 
				+			if (!_starpu_create_request_to_fetch_data(handle, &handle->per_node[target_node], STARPU_R, STARPU_IDLEFETCH, 1, NULL, NULL, 0, "starpu_memchunk_tidy"))
			
 
				 			{
			
 
				 				/* No request was actually needed??
			
 
				 				 * Odd, but cope with it.  */
			
@@ -1317,6 +1321,7 @@ static struct _starpu_mem_chunk *_starpu_memchunk_init(struct _starpu_data_repli
 
				 	mc->size_interface = interface_size;
			
 
				 	mc->remove_notify = NULL;
			
 
				 	mc->diduse = 0;
			
 
				+	mc->nb_tasks_prefetch = 0;
			
 
				 	mc->wontuse = 0;
			
 
				 
			
 
				 	return mc;
			
@@ -1430,7 +1435,7 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _star
 
				  *
			
 
				  */
			
 
				 
			
 
				-static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, unsigned is_prefetch)
			
 
				+static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, enum _starpu_is_prefetch is_prefetch)
			
 
				 {
			
 
				 	unsigned attempts = 0;
			
 
				 	starpu_ssize_t allocated_memory;
			
@@ -1514,7 +1519,7 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 
				 			reclaim -= freed;
			
 
				 
			
 
				 			/* Try to reuse an allocated data with the same interface (to avoid spurious free/alloc) */
			
 
				-			if (_starpu_has_not_important_data && try_to_reuse_not_important_mc(dst_node, handle, replicate, footprint))
			
 
				+			if (_starpu_has_not_important_data && try_to_reuse_not_important_mc(dst_node, handle, replicate, footprint, is_prefetch))
			
 
				 				break;
			
 
				 			if (try_to_reuse_potentially_in_use_mc(dst_node, handle, replicate, footprint, is_prefetch))
			
 
				 			{
			
@@ -1596,7 +1601,7 @@ out:
 
				 	return allocated_memory;
			
 
				 }
			
 
				 
			
 
				-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch)
			
 
				+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum _starpu_is_prefetch is_prefetch)
			
 
				 {
			
 
				 	starpu_ssize_t allocated_memory;
			
 
				 
			
--- a/src/datawizard/memalloc.h
+++ b/src/datawizard/memalloc.h
@@ -26,6 +26,7 @@
 
				 #include <datawizard/interfaces/data_interface.h>
			
 
				 #include <datawizard/coherency.h>
			
 
				 #include <datawizard/copy_driver.h>
			
 
				+#include <datawizard/data_request.h>
			
 
				 
			
 
				 struct _starpu_data_replicate;
			
 
				 
			
@@ -59,10 +60,17 @@ LIST_TYPE(_starpu_mem_chunk,
 
				 	/** Whether the memchunk is in the clean part of the mc_list */
			
 
				 	unsigned clean:1;
			
 
				 	/** Was this chunk used since it got allocated?  */
			
 
				+	/* FIXME: probably useless now with nb_tasks_prefetch */
			
 
				 	unsigned diduse:1;
			
 
				 	/** Was this chunk marked as "won't use"? */
			
 
				 	unsigned wontuse:1;
			
 
				 
			
 
				+	/** The number of prefetches that we made for this mc for various tasks
			
 
				+	 * This is also the number of tasks that we will wait to see use this mc before
			
 
				+	 * we attempt to evict it.
			
 
				+	 */
			
 
				+	unsigned nb_tasks_prefetch;
			
 
				+
			
 
				 	/** the size of the data is only set when calling _starpu_request_mem_chunk_removal(),
			
 
				 	 * it is needed to estimate how much memory is in mc_cache, and by
			
 
				 	 * free_memory_on_node() which is called when the handle is no longer
			
@@ -84,7 +92,7 @@ void _starpu_init_mem_chunk_lists(void);
 
				 void _starpu_deinit_mem_chunk_lists(void);
			
 
				 void _starpu_mem_chunk_init_last(void);
			
 
				 void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size);
			
 
				-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch);
			
 
				+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum _starpu_is_prefetch is_prefetch);
			
 
				 size_t _starpu_free_all_automatically_allocated_buffers(unsigned node);
			
 
				 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);
			
 
				 void _starpu_memchunk_wont_use(struct _starpu_mem_chunk *m, unsigned nodec);
			
--- a/src/datawizard/user_interactions.c
+++ b/src/datawizard/user_interactions.c
@@ -47,7 +47,7 @@ int starpu_data_request_allocation(starpu_data_handle_t handle, unsigned node)
 
				 
			
 
				 	_starpu_spin_lock(&handle->header_lock);
			
 
				 
			
 
				-	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, STARPU_NONE, 0, 1, 0, 0, "starpu_data_request_allocation");
			
 
				+	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, STARPU_NONE, 0, STARPU_PREFETCH, 0, 0, "starpu_data_request_allocation");
			
 
				 
			
 
				 	/* we do not increase the refcnt associated to the request since we are
			
 
				 	 * not waiting for its termination */
			
@@ -68,7 +68,7 @@ struct user_interaction_wrapper
 
				 	starpu_pthread_mutex_t lock;
			
 
				 	unsigned finished;
			
 
				 	unsigned detached;
			
 
				-	unsigned prefetch;
			
 
				+	enum _starpu_is_prefetch prefetch;
			
 
				 	unsigned async;
			
 
				 	int prio;
			
 
				 	void (*callback)(void *);
			
@@ -536,7 +536,7 @@ static void _prefetch_data_on_node(void *arg)
 
				 }
			
 
				 
			
 
				 static
			
 
				-int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigned node, unsigned async, enum starpu_data_access_mode mode, unsigned prefetch, int prio)
			
 
				+int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigned node, unsigned async, enum starpu_data_access_mode mode, enum _starpu_is_prefetch prefetch, int prio)
			
 
				 {
			
 
				 	STARPU_ASSERT(handle);
			
 
				 
			
@@ -596,12 +596,12 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 
				 
			
 
				 int starpu_data_fetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
			
 
				 {
			
 
				-	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, 0, 0);
			
 
				+	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_FETCH, 0);
			
 
				 }
			
 
				 
			
 
				 int starpu_data_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
			
 
				 {
			
 
				-	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, 1, prio);
			
 
				+	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_PREFETCH, prio);
			
 
				 }
			
 
				 
			
 
				 int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
			
@@ -611,7 +611,7 @@ int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, uns
 
				 
			
 
				 int starpu_data_idle_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
			
 
				 {
			
 
				-	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, 2, prio);
			
 
				+	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_IDLEFETCH, prio);
			
 
				 }
			
 
				 
			
 
				 int starpu_data_idle_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
			
--- a/src/datawizard/write_back.c
+++ b/src/datawizard/write_back.c
@@ -64,7 +64,7 @@ void _starpu_write_through_data(starpu_data_handle_t handle, unsigned requesting
 
				 
			
 
				 				struct _starpu_data_request *r;
			
 
				 				r = _starpu_create_request_to_fetch_data(handle, &handle->per_node[node],
			
 
				-									 STARPU_R, 2, 1, wt_callback, handle, 0, "_starpu_write_through_data");
			
 
				+									 STARPU_R, STARPU_IDLEFETCH, 1, wt_callback, handle, 0, "_starpu_write_through_data");
			
 
				 
			
 
				 			        /* If no request was created, the handle was already up-to-date on the
			
 
				 			         * node */
			
--- a/src/debug/latency.c
+++ b/src/debug/latency.c
@@ -34,7 +34,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 
				 		_starpu_spin_unlock(&handle->header_lock);
			
 
				 
			
 
				 		struct _starpu_data_replicate *replicate_0 = &handle->per_node[node0];
			
 
				-		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, 0, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
			
 
				+		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
			
 
				 		STARPU_ASSERT(!ret);
			
 
				 		_starpu_release_data_on_node(handle, node0, replicate_0);
			
 
				 
			
@@ -44,7 +44,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 
				 		_starpu_spin_unlock(&handle->header_lock);
			
 
				 
			
 
				 		struct _starpu_data_replicate *replicate_1 = &handle->per_node[node1];
			
 
				-		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, 0, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
			
 
				+		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
			
 
				 		STARPU_ASSERT(!ret);
			
 
				 		_starpu_release_data_on_node(handle, node1, replicate_1);
			
 
				 	}
			
--- a/src/debug/traces/starpu_fxt.c
+++ b/src/debug/traces/starpu_fxt.c
@@ -20,6 +20,7 @@
 
				 #include <starpu.h>
			
 
				 #include <common/config.h>
			
 
				 #include <common/uthash.h>
			
 
				+#include <datawizard/copy_driver.h>
			
 
				 #include <string.h>
			
 
				 
			
 
				 #ifdef STARPU_HAVE_POTI
			
@@ -2268,13 +2269,14 @@ static void handle_mpi_data_set_tag(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 
				 	data->mpi_tag = tag;
			
 
				 }
			
 
				 
			
 
				-static const char *copy_link_type(unsigned prefetch)
			
 
				+static const char *copy_link_type(enum _starpu_is_prefetch prefetch)
			
 
				 {
			
 
				 	switch (prefetch)
			
 
				 	{
			
 
				-		case 0: return "F";
			
 
				-		case 1: return "PF";
			
 
				-		case 2: return "IF";
			
 
				+		case STARPU_FETCH: return "F";
			
 
				+		case STARPU_TASK_PREFETCH: return "TF";
			
 
				+		case STARPU_PREFETCH: return "PF";
			
 
				+		case STARPU_IDLEFETCH: return "IF";
			
 
				 		default: STARPU_ASSERT(0);
			
 
				 	}
			
 
				 }
			
@@ -2285,7 +2287,7 @@ static void handle_start_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 
				 	unsigned dst = ev->param[1];
			
 
				 	unsigned size = ev->param[2];
			
 
				 	unsigned comid = ev->param[3];
			
 
				-	unsigned prefetch = ev->param[4];
			
 
				+	enum _starpu_is_prefetch prefetch = ev->param[4];
			
 
				 	unsigned long handle = ev->param[5];
			
 
				 	const char *link_type = copy_link_type(prefetch);
			
 
				 
			
@@ -2367,7 +2369,7 @@ static void handle_end_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 
				 	unsigned dst = ev->param[1];
			
 
				 	unsigned long size = ev->param[2];
			
 
				 	unsigned comid = ev->param[3];
			
 
				-	unsigned prefetch = ev->param[4];
			
 
				+	enum _starpu_is_prefetch prefetch = ev->param[4];
			
 
				 	const char *link_type = copy_link_type(prefetch);
			
 
				 
			
 
				 	char *prefix = options->file_prefix;
			
--- a/src/debug/traces/starpu_paje.c
+++ b/src/debug/traces/starpu_paje.c
@@ -398,6 +398,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 
				 	/* Link types */
			
 
				 	poti_DefineLinkType("MPIL", "MPIP", "MPICt", "MPICt", "MPI communication");
			
 
				 	poti_DefineLinkType("F", "P", "Mm", "Mm", "Intra-node data Fetch");
			
 
				+	poti_DefineLinkType("TF", "P", "Mm", "Mm", "Intra-node data TaskPreFetch");
			
 
				 	poti_DefineLinkType("PF", "P", "Mm", "Mm", "Intra-node data PreFetch");
			
 
				 	poti_DefineLinkType("IF", "P", "Mm", "Mm", "Intra-node data IdleFetch");
			
 
				 	poti_DefineLinkType("WSL", "P", "W", "W", "Work steal");
			
@@ -551,6 +552,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 
				 6       No       MS     Nothing         \".0 .0 .0\"		\n\
			
 
				 5       MPIL     MPIP	MPICt	MPICt   \"MPI communication\"\n\
			
 
				 5       F       P	Mm	Mm      \"Intra-node data Fetch\"\n\
			
 
				+5       TF      P	Mm	Mm      \"Intra-node data TaskPreFetch\"\n\
			
 
				 5       PF      P	Mm	Mm      \"Intra-node data PreFetch\"\n\
			
 
				 5       IF      P	Mm	Mm      \"Intra-node data IdleFetch\"\n\
			
 
				 5       WSL     P	W	W       \"Work steal\"\n");
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -537,7 +537,7 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 
				 				async ? &task_finished[workerid][pipeline_idx] : NULL);
			
 
				 		}
			
 
				 #else
			
 
				-#ifdef HAVE_LIBNVIDIA_ML
			
 
				+#ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
			
 
				 		unsigned long long energy_start = 0;
			
 
				 		nvmlReturn_t nvmlRet = -1;
			
 
				 		if (profiling && task->profiling_info)
			
@@ -561,7 +561,7 @@ static void finish_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *wor
 
				 	int profiling = starpu_profiling_status_get();
			
 
				 
			
 
				 
			
 
				-#ifdef HAVE_LIBNVIDIA_ML
			
 
				+#ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
			
 
				 	if (profiling && j->task->profiling_info && j->task->profiling_info->energy_consumed)
			
 
				 	{
			
 
				 		unsigned long long energy_end;
			
--- a/src/sched_policies/component_sched.c
+++ b/src/sched_policies/component_sched.c
@@ -160,7 +160,6 @@ void starpu_sched_component_prefetch_on_node(struct starpu_sched_component * com
 
				 		int worker = starpu_bitmap_first(&component->workers_in_ctx);
			
 
				 		unsigned memory_node = starpu_worker_get_memory_node(worker);
			
 
				 		starpu_prefetch_task_input_on_node(task, memory_node);
			
 
				-		task->prefetched = 1;
			
 
				 	}
			
 
				 }
			
 
				 
			
--- a/starpufft/src/cudax_kernels.cu
+++ b/starpufft/src/cudax_kernels.cu
@@ -30,12 +30,16 @@
 
				 	{			   \
			
 
				 		dim3 dimGrid(n); \
			
 
				 		func <<<dimGrid, 1, 0, starpu_cuda_get_local_stream()>>> args; \
			
 
				+		cudaError_t status = cudaGetLastError(); \
			
 
				+		if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status); \
			
 
				 	} 					\
			
 
				 	else 					\
			
 
				 	{				     \
			
 
				 		dim3 dimGrid(n / threads_per_block); \
			
 
				 		dim3 dimBlock(threads_per_block); \
			
 
				 		func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
			
 
				+		cudaError_t status = cudaGetLastError(); \
			
 
				+		if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status); \
			
 
				 	} \
			
 
				 	cudaStreamSynchronize(starpu_cuda_get_local_stream()); \
			
 
				 
			
@@ -85,12 +89,16 @@ extern "C" void STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComple
 
				 		{			    \
			
 
				 			dim3 dimGrid(n, m); \
			
 
				 			func <<<dimGrid, 1, 0, starpu_cuda_get_local_stream()>>> args; \
			
 
				+			cudaError_t status = cudaGetLastError(); \
			
 
				+			if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status); \
			
 
				 		} \
			
 
				 		else \
			
 
				 		{					      \
			
 
				 			dim3 dimGrid(1, m / threads_per_dim); \
			
 
				 			dim3 dimBlock(n, threads_per_dim); \
			
 
				 			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
			
 
				+			cudaError_t status = cudaGetLastError(); \
			
 
				+			if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status); \
			
 
				 		} \
			
 
				 	} \
			
 
				 	else \
			
@@ -100,12 +108,16 @@ extern "C" void STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComple
 
				 			dim3 dimGrid(n / threads_per_dim, 1); \
			
 
				 			dim3 dimBlock(threads_per_dim, m); \
			
 
				 			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
			
 
				+			cudaError_t status = cudaGetLastError(); \
			
 
				+			if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status); \
			
 
				 		} \
			
 
				 		else \
			
 
				 		{							\
			
 
				 			dim3 dimGrid(n / threads_per_dim, m / threads_per_dim); \
			
 
				 			dim3 dimBlock(threads_per_dim, threads_per_dim); \
			
 
				 			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
			
 
				+			cudaError_t status = cudaGetLastError(); \
			
 
				+			if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status); \
			
 
				 		} \
			
 
				 	} \
			
 
				 	cudaStreamSynchronize(starpu_cuda_get_local_stream()); \
			
--- a/starpurm/dev/cuda_vector_scale/vs_cuda_kernel.cu
+++ b/starpurm/dev/cuda_vector_scale/vs_cuda_kernel.cu
@@ -59,4 +59,6 @@ extern "C" void vector_scale_cuda_func(void *cl_buffers[], void *cl_arg)
 
				 	unsigned nb_threads_per_block = 64;
			
 
				 	unsigned nb_blocks = (n + nb_threads_per_block-1) / nb_threads_per_block;
			
 
				 	vector_scale_cuda_kernel<<<nb_blocks,nb_threads_per_block,0,starpu_cuda_get_local_stream()>>>(vector, n, scalar);
			
 
				+	cudaError_t status = cudaGetLastError();
			
 
				+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
--- a/tests/datawizard/acquire_release_cuda.cu
+++ b/tests/datawizard/acquire_release_cuda.cu
@@ -26,4 +26,6 @@ extern "C" void increment_cuda(void *descr[], void *cl_arg)
 
				 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				 
			
 
				 	_increment_cuda_codelet<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
			
 
				+	cudaError_t status = cudaGetLastError();
			
 
				+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
--- a/tests/datawizard/cuda_codelet_unsigned_inc.cu
+++ b/tests/datawizard/cuda_codelet_unsigned_inc.cu
@@ -27,4 +27,6 @@ extern "C" void cuda_codelet_unsigned_inc(void *descr[], void *cl_arg)
 
				 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				 
			
 
				 	_cuda_unsigned_inc<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
			
 
				+	cudaError_t status = cudaGetLastError();
			
 
				+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
--- a/tests/datawizard/interfaces/bcsr/bcsr_cuda.cu
+++ b/tests/datawizard/interfaces/bcsr/bcsr_cuda.cu
@@ -60,6 +60,8 @@ extern "C" void test_bcsr_cuda_func(void *buffers[], void *args)
 
				 
			
 
				         bcsr_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>
			
 
				 		(val, nnz, ret, factor);
			
 
				+	error = cudaGetLastError();
			
 
				+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
			
 
				 
			
 
				 	error = cudaMemcpyAsync(&bcsr_config.copy_failed,
			
 
				 			   ret,
			
--- a/tests/datawizard/interfaces/block/block_cuda.cu
+++ b/tests/datawizard/interfaces/block/block_cuda.cu
@@ -70,6 +70,8 @@ extern "C" void test_block_cuda_func(void *buffers[], void *args)
 
				 
			
 
				         block_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>
			
 
				 		(block, nx, ny, nz, ldy, ldz, factor, ret);
			
 
				+	error = cudaGetLastError();
			
 
				+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
			
 
				 	error = cudaMemcpyAsync(&block_config.copy_failed, ret, sizeof(int), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				 	if (error != cudaSuccess)
			
 
				 		STARPU_CUDA_REPORT_ERROR(error);
			
--- a/tests/datawizard/interfaces/coo/coo_cuda.cu
+++ b/tests/datawizard/interfaces/coo/coo_cuda.cu
@@ -57,6 +57,8 @@ extern "C" void test_coo_cuda_func(void *buffers[], void *args)
 
				 
			
 
				         coo_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>
			
 
				 		(val, nvalues, ret, factor);
			
 
				+	error = cudaGetLastError();
			
 
				+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
			
 
				 
			
 
				 	error = cudaMemcpyAsync(&coo_config.copy_failed,
			
 
				 			   ret,
			
--- a/tests/datawizard/interfaces/csr/csr_cuda.cu
+++ b/tests/datawizard/interfaces/csr/csr_cuda.cu
@@ -56,6 +56,8 @@ extern "C" void test_csr_cuda_func(void *buffers[], void *args)
 
				 		STARPU_CUDA_REPORT_ERROR(error);
			
 
				 
			
 
				         csr_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>> (val, nnz, ret, factor);
			
 
				+	error = cudaGetLastError();
			
 
				+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
			
 
				 
			
 
				 	error = cudaMemcpyAsync(&csr_config.copy_failed,
			
 
				 			   ret,
			
--- a/tests/datawizard/interfaces/matrix/matrix_cuda.cu
+++ b/tests/datawizard/interfaces/matrix/matrix_cuda.cu
@@ -59,6 +59,8 @@ extern "C" void test_matrix_cuda_func(void *buffers[], void *args)
 
				 		STARPU_CUDA_REPORT_ERROR(error);
			
 
				 
			
 
				         matrix_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(val, n, ret, factor);
			
 
				+	error = cudaGetLastError();
			
 
				+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
			
 
				 
			
 
				 	error = cudaMemcpyAsync(&matrix_config.copy_failed,
			
 
				 			   ret,
			
--- a/tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets_cuda.cu
+++ b/tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets_cuda.cu
@@ -45,4 +45,6 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 
				 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
			
 
				 
			
 
				         cpu_to_cuda_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(src, dst, n);
			
 
				+        cudaError_t status = cudaGetLastError();
			
 
				+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
--- a/tests/datawizard/interfaces/multiformat/multiformat_cuda.cu
+++ b/tests/datawizard/interfaces/multiformat/multiformat_cuda.cu
@@ -65,6 +65,8 @@ extern "C" void test_multiformat_cuda_func(void *buffers[], void *args)
 
				 		STARPU_CUDA_REPORT_ERROR(error);
			
 
				 
			
 
				         multiformat_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(soa, n, ret, factor);
			
 
				+	error = cudaGetLastError();
			
 
				+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
			
 
				 
			
 
				 	error = cudaMemcpyAsync(&multiformat_config.copy_failed,
			
 
				 			   ret,
			
--- a/tests/datawizard/interfaces/tensor/tensor_cuda.cu
+++ b/tests/datawizard/interfaces/tensor/tensor_cuda.cu
@@ -75,6 +75,8 @@ extern "C" void test_tensor_cuda_func(void *buffers[], void *args)
 
				 
			
 
				         tensor_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>
			
 
				 		(tensor, nx, ny, nz, nt, ldy, ldz, ldt, factor, ret);
			
 
				+	error = cudaGetLastError();
			
 
				+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
			
 
				 	error = cudaMemcpyAsync(&tensor_config.copy_failed, ret, sizeof(int), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				 	if (error != cudaSuccess)
			
 
				 		STARPU_CUDA_REPORT_ERROR(error);
			
--- a/tests/datawizard/interfaces/variable/variable_cuda.cu
+++ b/tests/datawizard/interfaces/variable/variable_cuda.cu
@@ -55,6 +55,8 @@ extern "C" void test_variable_cuda_func(void *buffers[], void *args)
 
				 	unsigned nblocks = 1;
			
 
				 
			
 
				         variable_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(val, ret, factor);
			
 
				+	error = cudaGetLastError();
			
 
				+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
			
 
				 	error = cudaMemcpyAsync(&variable_config.copy_failed,
			
 
				 			   ret,
			
 
				 			   sizeof(int),
			
--- a/tests/datawizard/interfaces/vector/vector_cuda.cu
+++ b/tests/datawizard/interfaces/vector/vector_cuda.cu
@@ -55,6 +55,8 @@ extern "C" void test_vector_cuda_func(void *buffers[], void *args)
 
				 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
			
 
				 
			
 
				         framework_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(val, n, ret, factor);
			
 
				+	error = cudaGetLastError();
			
 
				+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
			
 
				 	error = cudaMemcpyAsync(&vector_config.copy_failed, ret, sizeof(int), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
			
 
				 	if (error != cudaSuccess)
			
 
				 	{
			
--- a/tests/datawizard/scal_cuda.cu
+++ b/tests/datawizard/scal_cuda.cu
@@ -32,4 +32,6 @@ extern "C" void scal_func_cuda(void *buffers[], void *_args)
 
				 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
			
 
				 
			
 
				         vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(val, n);
			
 
				+	cudaError_t status = cudaGetLastError();
			
 
				+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
--- a/tests/datawizard/scratch_cuda.cu
+++ b/tests/datawizard/scratch_cuda.cu
@@ -46,4 +46,6 @@ extern "C" void cuda_f(void *descr[], void *_args)
 
				 	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nx / nblocks));
			
 
				 
			
 
				 	increment_vector<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(v, tmp, nx);
			
 
				+	cudaError_t status = cudaGetLastError();
			
 
				+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
--- a/tests/datawizard/sync_and_notify_data_kernels.cu
+++ b/tests/datawizard/sync_and_notify_data_kernels.cu
@@ -30,6 +30,8 @@ extern "C" void cuda_codelet_incA(void *descr[], void *_args)
 
				 	unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				 
			
 
				 	_cuda_incA<<<1,1, 0, starpu_cuda_get_local_stream()>>>(v);
			
 
				+	cudaError_t status = cudaGetLastError();
			
 
				+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -46,4 +48,6 @@ extern "C" void cuda_codelet_incC(void *descr[], void *_args)
 
				 	unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
			
 
				 
			
 
				 	_cuda_incC<<<1,1, 0, starpu_cuda_get_local_stream()>>>(v);
			
 
				+	cudaError_t status = cudaGetLastError();
			
 
				+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
--- a/tests/main/increment.cu
+++ b/tests/main/increment.cu
@@ -27,4 +27,6 @@ extern "C" void cuda_host_increment(void *descr[], void *_args)
 
				 	unsigned *var = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
			
 
				 
			
 
				 	cuda_increment<<<1,1, 0, starpu_cuda_get_local_stream()>>>(var);
			
 
				+	cudaError_t status = cudaGetLastError();
			
 
				+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
--- a/tests/overlap/long_kernel.cu
+++ b/tests/overlap/long_kernel.cu
@@ -29,4 +29,6 @@ extern "C" void long_kernel_cuda(unsigned long niters)
 
				 	dim3 dimBlock(1,1);
			
 
				 	dim3 dimGrid(1,1);
			
 
				 	long_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(niters);
			
 
				+	cudaError_t status = cudaGetLastError();
			
 
				+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
			
 
				 }
			
--- a/tools/gdbinit
+++ b/tools/gdbinit
@@ -189,6 +189,9 @@ define starpu-workers
 
				     if $worker->status == STATUS_WAITING
			
 
				       set $status="WAITING"
			
 
				     end
			
 
				+    if $worker->status == STATUS_SLEEPING_SCHEDULING
			
 
				+      set $status="SLEEPING_SCHEDULING"
			
 
				+    end
			
 
				     if $worker->status == STATUS_SLEEPING
			
 
				       set $status="SLEEPING"
			
 
				     end
			
@@ -635,7 +638,11 @@ define starpu-print-requests-tree
 
				 end
			
 
				 
			
 
				 define starpu-print-requests-list
			
 
				-  starpu-print-requests-tree $arg0.tree.root
			
 
				+  if _starpu_debug
			
 
				+    starpu-print-requests-treelist &$arg0.list
			
 
				+  else
			
 
				+    starpu-print-requests-tree $arg0.tree.root
			
 
				+  end
			
 
				 end
			
 
				 
			
 
				 define starpu-print-requests