Bläddra i källkod

Merge remote-tracking branch 'origin/master' into ft_checkpoint

Romain LION 5 år sedan
förälder
incheckning
8d264c88b3
72 ändrade filer med 309 tillägg och 100 borttagningar
  1. 2 0
      ChangeLog
  2. 3 0
      configure.ac
  3. 2 0
      doc/doxygen/chapters/210_check_list_performance.doxy
  4. 2 0
      doc/doxygen/chapters/code/vector_scal_cuda.c
  5. 2 0
      doc/tutorial/vector_scal_cuda.cu
  6. 2 0
      examples/basic_examples/block_cuda.cu
  7. 2 0
      examples/basic_examples/multiformat_conversion_codelets_cuda.cu
  8. 2 0
      examples/basic_examples/multiformat_cuda.cu
  9. 2 0
      examples/basic_examples/variable_kernels.cu
  10. 2 0
      examples/basic_examples/vector_scal_cuda.cu
  11. 2 0
      examples/filters/custom_mf/conversion.cu
  12. 2 0
      examples/filters/custom_mf/cuda.cu
  13. 2 0
      examples/filters/fblock_cuda.cu
  14. 4 0
      examples/filters/fmultiple_cuda.cu
  15. 2 0
      examples/incrementer/incrementer_kernels.cu
  16. 2 0
      examples/interface/complex_kernels.cu
  17. 2 0
      examples/pi/SobolQRNG/sobol_gpu.cu
  18. 4 0
      examples/pi/pi_kernel.cu
  19. 4 0
      examples/pi/pi_redux_kernel.cu
  20. 2 0
      examples/reductions/dot_product_kernels.cu
  21. 2 0
      examples/sched_ctx/axpy_partition_gpu.cu
  22. 2 0
      examples/spmv/spmv_cuda.cu
  23. 2 0
      examples/stencil/life_cuda.cu
  24. 2 0
      examples/stencil/shadow.cu
  25. 2 0
      julia/examples/mult/gpu_mult.cu
  26. 2 0
      julia/examples/old_examples/gpu_mult.cu
  27. 2 0
      julia/examples/old_examples/mandelbrot/gpu_mandelbrot.cu
  28. 2 0
      julia/examples/old_examples/mandelbrot/gpu_mandelbrot_between.cu
  29. 2 0
      julia/examples/old_examples/mult/gpu_mult.cu
  30. 4 0
      julia/examples/old_examples/nbody/gpu_nbody.cu
  31. 2 0
      julia/examples/old_examples/nbody/gpu_nbody_between.cu
  32. 4 0
      julia/src/compiler/expressions.jl
  33. 2 0
      mpi/tests/ring_kernel.cu
  34. 1 0
      src/core/jobs.c
  35. 2 4
      src/core/sched_policy.c
  36. 51 14
      src/datawizard/coherency.c
  37. 2 2
      src/datawizard/coherency.h
  38. 2 2
      src/datawizard/copy_driver.c
  39. 10 1
      src/datawizard/copy_driver.h
  40. 44 29
      src/datawizard/data_request.c
  41. 7 8
      src/datawizard/data_request.h
  42. 1 1
      src/datawizard/filters.c
  43. 1 1
      src/datawizard/interfaces/data_interface.c
  44. 23 18
      src/datawizard/memalloc.c
  45. 9 1
      src/datawizard/memalloc.h
  46. 6 6
      src/datawizard/user_interactions.c
  47. 1 1
      src/datawizard/write_back.c
  48. 2 2
      src/debug/latency.c
  49. 8 6
      src/debug/traces/starpu_fxt.c
  50. 2 0
      src/debug/traces/starpu_paje.c
  51. 2 2
      src/drivers/cuda/driver_cuda.c
  52. 0 1
      src/sched_policies/component_sched.c
  53. 12 0
      starpufft/src/cudax_kernels.cu
  54. 2 0
      starpurm/dev/cuda_vector_scale/vs_cuda_kernel.cu
  55. 2 0
      tests/datawizard/acquire_release_cuda.cu
  56. 2 0
      tests/datawizard/cuda_codelet_unsigned_inc.cu
  57. 2 0
      tests/datawizard/interfaces/bcsr/bcsr_cuda.cu
  58. 2 0
      tests/datawizard/interfaces/block/block_cuda.cu
  59. 2 0
      tests/datawizard/interfaces/coo/coo_cuda.cu
  60. 2 0
      tests/datawizard/interfaces/csr/csr_cuda.cu
  61. 2 0
      tests/datawizard/interfaces/matrix/matrix_cuda.cu
  62. 2 0
      tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets_cuda.cu
  63. 2 0
      tests/datawizard/interfaces/multiformat/multiformat_cuda.cu
  64. 2 0
      tests/datawizard/interfaces/tensor/tensor_cuda.cu
  65. 2 0
      tests/datawizard/interfaces/variable/variable_cuda.cu
  66. 2 0
      tests/datawizard/interfaces/vector/vector_cuda.cu
  67. 2 0
      tests/datawizard/scal_cuda.cu
  68. 2 0
      tests/datawizard/scratch_cuda.cu
  69. 4 0
      tests/datawizard/sync_and_notify_data_kernels.cu
  70. 2 0
      tests/main/increment.cu
  71. 2 0
      tests/overlap/long_kernel.cu
  72. 8 1
      tools/gdbinit

+ 2 - 0
ChangeLog

@@ -39,6 +39,8 @@ New features:
     its OS index.
   * New function starpu_get_hwloc_topology() to get a copy of the hwloc
     topology used by StarPU.
+  * Add a task prefetch level, to improve retaining data in accelerators so we
+    can make prefetch more aggressive.
 
 Small changes:
   * Use the S4U interface of Simgrid instead of xbt and MSG.

+ 3 - 0
configure.ac

@@ -1382,6 +1382,9 @@ if test x$enable_cuda = xyes; then
 	    ]
 	)
 	if test x$have_valid_nvml = xyes ; then
+		AC_CHECK_DECLS([nvmlDeviceGetTotalEnergyConsumption], [
+			AC_CHECK_FUNCS([nvmlDeviceGetTotalEnergyConsumption])
+			], [], [[#include <nvml.h>]])
 		AC_DEFINE([HAVE_LIBNVIDIA_ML], [1], [Define to 1 if you have the nvidia-ml library])
 		STARPU_CUDA_LDFLAGS="$STARPU_CUDA_LDFLAGS -lnvidia-ml"
 	fi

+ 2 - 0
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -91,6 +91,8 @@ operations to avoid this issue. For instance:
 
 \code{.c}
 func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
+cudaError_t status = cudaGetLastError();
+if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 cudaStreamSynchronize(starpu_cuda_get_local_stream());
 \endcode
 

+ 2 - 0
doc/doxygen/chapters/code/vector_scal_cuda.c

@@ -35,6 +35,8 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(n, val, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
         cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
doc/tutorial/vector_scal_cuda.cu

@@ -35,6 +35,8 @@ extern "C" void vector_scal_cuda(void *buffers[], void *_args)
         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
         cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
examples/basic_examples/block_cuda.cu

@@ -40,5 +40,7 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
         float *multiplier = (float *)_args;
 
         cuda_block<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *multiplier);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
examples/basic_examples/multiformat_conversion_codelets_cuda.cu

@@ -44,4 +44,6 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         cpu_to_cuda_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(src, dst, n);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/basic_examples/multiformat_cuda.cu

@@ -39,6 +39,8 @@ extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         multiformat_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(soa, n);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
examples/basic_examples/variable_kernels.cu

@@ -27,5 +27,7 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 	float *val = (float *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	cuda_variable<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
examples/basic_examples/vector_scal_cuda.cu

@@ -41,4 +41,6 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(n, val, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/filters/custom_mf/conversion.cu

@@ -45,4 +45,6 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         custom_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(aop, n, x, y);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/filters/custom_mf/cuda.cu

@@ -39,4 +39,6 @@ extern "C" void custom_scal_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         scal_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(n, x, y);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/filters/fblock_cuda.cu

@@ -43,4 +43,6 @@ extern "C" void cuda_func(void *buffers[], void *_args)
 
         /* TODO: use more blocks and threads in blocks */
         fblock_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 4 - 0
examples/filters/fmultiple_cuda.cu

@@ -44,6 +44,8 @@ extern "C" void fmultiple_check_scale_cuda(void *buffers[], void *cl_arg)
 
         /* TODO: use more vals and threads in vals */
 	_fmultiple_check_scale_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val, nx, ny, ld, start, factor);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 
 static __global__ void _fmultiple_check_cuda(int *val, int nx, int ny, unsigned ld, int start, int factor)
@@ -71,4 +73,6 @@ extern "C" void fmultiple_check_cuda(void *buffers[], void *cl_arg)
 
         /* TODO: use more vals and threads in vals */
 	_fmultiple_check_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val, nx, ny, ld, start, factor);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/incrementer/incrementer_kernels.cu

@@ -32,4 +32,6 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/interface/complex_kernels.cu

@@ -44,4 +44,6 @@ extern "C" void copy_complex_codelet_cuda(void *descr[], void *_args)
 	unsigned nblocks = (nx + threads_per_block-1) / threads_per_block;
 
         complex_copy_cuda<<<nblocks, threads_per_block, 0, starpu_cuda_get_local_stream()>>>(o_real, o_imaginary, i_real, i_imaginary, nx);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/pi/SobolQRNG/sobol_gpu.cu

@@ -165,4 +165,6 @@ extern "C" void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_direct
 
     // Execute GPU kernel
     sobolGPU_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(n_vectors, n_dimensions, d_directions, d_output);
+    cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 4 - 0
examples/pi/pi_kernel.cu

@@ -137,12 +137,16 @@ extern "C" void cuda_kernel(void *descr[], void *cl_arg)
 	/* each entry of per_block_cnt contains the number of successful shots
 	 * in the corresponding block. */
 	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 
 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
 
 	/* compute the total number of successful shots by adding the elements
 	 * of the per_block_cnt array */
 	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	if (cures)
 		STARPU_CUDA_REPORT_ERROR(cures);

+ 4 - 0
examples/pi/pi_redux_kernel.cu

@@ -115,12 +115,16 @@ extern "C" void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned lo
 	/* each entry of per_block_cnt contains the number of successful shots
 	 * in the corresponding block. */
 	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, y, n, per_block_cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 
 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
 
 	/* compute the total number of successful shots by adding the elements
 	 * of the per_block_cnt array */
 	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, shot_cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	if (cures)
 		STARPU_CUDA_REPORT_ERROR(cures);

+ 2 - 0
examples/reductions/dot_product_kernels.cu

@@ -33,4 +33,6 @@ extern "C" void redux_cuda_func(void *descr[], void *_args)
 	DOT_TYPE *dotb = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
 	cuda_redux<<<1,1, 0, starpu_cuda_get_local_stream()>>>(dota, dotb);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/sched_ctx/axpy_partition_gpu.cu

@@ -73,4 +73,6 @@ extern "C" void cuda_axpy(void *descr[], void *_args)
 	__P_HOSTSETUP(saxpy_partitioned,dim3(dimensions,1,1),dimensions,0,SM_mapping_start,SM_allocation,stream);
 
   	saxpy_partitioned<<<width,dimensions,0,stream>>>(__P_HKARGS,n,a,x,y);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/spmv/spmv_cuda.cu

@@ -97,6 +97,8 @@ extern "C" void spmv_kernel_cuda(void *descr[], void *args)
 
 	spmv_kernel_3<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>
 		(nnz, nrow, nzval, colind, rowptr, firstentry, vecin, nx_in, vecout, nx_out);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 
 

+ 2 - 0
examples/stencil/life_cuda.cu

@@ -73,4 +73,6 @@ extern "C" void cuda_life_update_host(int bz, const TYPE *old, TYPE *newp, int n
 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
 #endif
 	cuda_life_update <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, old, newp, nx, ny, nz, ldy, ldz, iter);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/stencil/shadow.cu

@@ -53,4 +53,6 @@ extern "C" void cuda_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int
 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
 #endif
 	cuda_shadow <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, ptr, nx, ny, nz, ldy, ldz, i);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
julia/examples/mult/gpu_mult.cu

@@ -79,6 +79,8 @@ extern "C" void gpu_mult(void * descr[], void * args)
 	gpuMultKernel
 		<<< nblocks, THREADS_PER_BLOCK, 0, NULL /*starpu_cuda_get_local_stream()*/
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 

+ 2 - 0
julia/examples/old_examples/gpu_mult.cu

@@ -78,6 +78,8 @@ extern "C" void gpu_mult(void * descr[], void * args)
 	gpuMultKernel
 		<<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 

+ 2 - 0
julia/examples/old_examples/mandelbrot/gpu_mandelbrot.cu

@@ -106,6 +106,8 @@ extern "C" void gpu_mandelbrot(void *descr[], void *args)
   nblocks = (nxP * nyP + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
 
   gpuMandelbrotKernel <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream() >>> (nxP, nyP, ldP, d_subP, *params);
+  cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
julia/examples/old_examples/mandelbrot/gpu_mandelbrot_between.cu

@@ -123,6 +123,8 @@ extern "C" void CUDA_mandelbrot(void** buffers_uwrYFDVe, void* cl_arg_uwrYFDVe)
              ptr_qoUGBRtY, local_height, conv_limit, ptr_A5zD9sJZ, 
              ld_A5zD9sJZ);
     ;
+    cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
     cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 

+ 2 - 0
julia/examples/old_examples/mult/gpu_mult.cu

@@ -78,6 +78,8 @@ extern "C" void gpu_mult(void * descr[], void * args)
 	gpuMultKernel
 		<<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 

+ 4 - 0
julia/examples/old_examples/nbody/gpu_nbody.cu

@@ -94,6 +94,8 @@ extern "C" void gpu_nbody(void * descr[], void * args)
   gpuNbodyKernel
     <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
     >>> (d_P,  d_subA, d_M, nxP, nxA, nxM, ldP, ldA, *params);
+  cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
@@ -156,6 +158,8 @@ extern "C" void gpu_nbody2(void * descr[], void *args)
   gpuNbody2Kernel
     <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
     >>> (d_subP, d_subV, d_subA, nxP, nxV, nxA, ldP, ldV, ldA, *params);
+  cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
julia/examples/old_examples/nbody/gpu_nbody_between.cu

@@ -161,6 +161,8 @@ extern "C" void CUDA_nbody_updt(void** buffers_gj6UYWT4, void* cl_arg_gj6UYWT4)
              ld_jJ5f8wMA, ptr_piPvdbTs, ld_piPvdbTs, ptr_JBaPgPiT, 
              ptr_0STm2S4k, ld_0STm2S4k);
     ;
+    cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
     cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 

+ 4 - 0
julia/src/compiler/expressions.jl

@@ -335,6 +335,10 @@ function print(io :: IO, expr :: StarpuExprCudaCall ; indent = 0,restrict=false)
 
     print(io, ");")
     print_newline(io, indent)
+    print(io, "cudaError_t status = cudaGetLastError();")
+    print_newline(io, indent)
+    print(io, "if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);")
+    print_newline(io, indent)
 
 end
 

+ 2 - 0
mpi/tests/ring_kernel.cu

@@ -27,5 +27,7 @@ extern "C" void increment_cuda(void *descr[], void *_args)
 	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(tokenptr);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 1 - 0
src/core/jobs.c

@@ -485,6 +485,7 @@ void _starpu_handle_job_termination(struct _starpu_job *j)
 		 * also the callback were executed. */
 		j->terminated = 2;
 	}
+	task->prefetched = 0;
 	STARPU_PTHREAD_COND_BROADCAST(&j->sync_cond);
 	STARPU_AYU_REMOVETASK(j->job_id);
 	STARPU_PTHREAD_MUTEX_UNLOCK(&j->sync_mutex);

+ 2 - 4
src/core/sched_policy.c

@@ -570,9 +570,6 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 	int ret = 0;
 	if (STARPU_UNLIKELY(task->execute_on_a_specific_worker))
 	{
-		if (starpu_get_prefetch_flag())
-			starpu_prefetch_task_input_for(task, task->workerid);
-
 		ret = _starpu_push_task_on_specific_worker(task, task->workerid);
 	}
 	else
@@ -582,7 +579,8 @@ int _starpu_push_task_to_workers(struct starpu_task *task)
 		/* When a task can only be executed on a given arch and we have
 		 * only one memory node for that arch, we can systematically
 		 * prefetch before the scheduling decision. */
-		if (starpu_get_prefetch_flag() && starpu_memory_nodes_get_count() > 1)
+		/* XXX: no, this is definitely not a good idea if the scheduler reorders tasks... */
+		if (0 && starpu_get_prefetch_flag() && starpu_memory_nodes_get_count() > 1)
 		{
 			if (task->where == STARPU_CPU && config->cpus_nodeid >= 0)
 				starpu_prefetch_task_input_on_node(task, config->cpus_nodeid);

+ 51 - 14
src/datawizard/coherency.c

@@ -404,7 +404,7 @@ int _starpu_determine_request_path(starpu_data_handle_t handle,
 /* handle->lock should be taken. r is returned locked. The node parameter
  * indicate either the source of the request, or the destination for a
  * write-only request. */
-static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, unsigned is_prefetch)
+static struct _starpu_data_request *_starpu_search_existing_data_request(struct _starpu_data_replicate *replicate, unsigned node, enum starpu_data_access_mode mode, enum _starpu_is_prefetch is_prefetch)
 {
 	struct _starpu_data_request *r;
 
@@ -467,7 +467,7 @@ static struct _starpu_data_request *_starpu_search_existing_data_request(struct
 
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 								  struct _starpu_data_replicate *dst_replicate,
-								  enum starpu_data_access_mode mode, unsigned is_prefetch,
+								  enum starpu_data_access_mode mode, enum _starpu_is_prefetch is_prefetch,
 								  unsigned async,
 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 {
@@ -522,7 +522,13 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 #endif
 
 			if (dst_replicate->mc)
+			{
+				if (is_prefetch == STARPU_TASK_PREFETCH)
+					/* Make sure it stays there */
+					dst_replicate->mc->nb_tasks_prefetch++;
+
 				_starpu_memchunk_recently_used(dst_replicate->mc, requesting_node);
+			}
 		}
 
 		_starpu_spin_unlock(&handle->header_lock);
@@ -567,6 +573,9 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 			if (_starpu_allocate_memory_on_node(handle, dst_replicate, is_prefetch) == 0)
 			{
 				_starpu_update_data_state(handle, dst_replicate, mode);
+				if (is_prefetch == STARPU_TASK_PREFETCH)
+					/* Make sure it stays there */
+					dst_replicate->mc->nb_tasks_prefetch++;
 
 				_starpu_spin_unlock(&handle->header_lock);
 
@@ -645,9 +654,17 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 				STARPU_ASSERT(r->next_req_count <= STARPU_MAXNODES);
 			}
 		}
-		else if (!write_invalidation)
-			/* The last request will perform the callback after termination */
-			_starpu_data_request_append_callback(r, callback_func, callback_arg);
+		else
+		{
+			if (is_prefetch == STARPU_TASK_PREFETCH)
+				/* Make last request add the prefetch count on the mc to keep the data
+				 * there until the task gets to execute.  */
+				r->nb_tasks_prefetch++;
+
+			if (!write_invalidation)
+				/* The last request will perform the callback after termination */
+				_starpu_data_request_append_callback(r, callback_func, callback_arg);
+		}
 
 
 		if (reused_requests[hop])
@@ -712,7 +729,7 @@ struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_ha
 }
 
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *dst_replicate,
-			       enum starpu_data_access_mode mode, unsigned detached, unsigned is_prefetch, unsigned async,
+			       enum starpu_data_access_mode mode, unsigned detached, enum _starpu_is_prefetch is_prefetch, unsigned async,
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin)
 {
         _STARPU_LOG_IN();
@@ -744,6 +761,7 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 		if (src_node_mask == 0)
 		{
 			/* no valid copy, nothing to prefetch */
+			_STARPU_DISP("Warning: no valid copy to prefetch?! that's not supposed to happen, please report\n");
 			_starpu_spin_unlock(&handle->header_lock);
 			return 0;
 		}
@@ -782,17 +800,22 @@ int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _st
 
 static int idle_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, 2, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_IDLEFETCH, 1, NULL, NULL, prio, "idle_prefetch_data_on_node");
 }
 
-static int prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
+static int task_prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, 1, 1, NULL, NULL, prio, "prefetch_data_on_node");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_TASK_PREFETCH, 1, NULL, NULL, prio, "task_prefetch_data_on_node");
+}
+
+static int STARPU_ATTRIBUTE_UNUSED prefetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
+{
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 1, STARPU_PREFETCH, 1, NULL, NULL, prio, "prefetch_data_on_node");
 }
 
 static int fetch_data(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate, enum starpu_data_access_mode mode, int prio)
 {
-	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, 0, 0, NULL, NULL, prio, "fetch_data");
+	return _starpu_fetch_data_on_node(handle, node, replicate, mode, 0, STARPU_FETCH, 0, NULL, NULL, prio, "fetch_data");
 }
 
 uint32_t _starpu_get_data_refcnt(starpu_data_handle_t handle, unsigned node)
@@ -911,10 +934,11 @@ int starpu_prefetch_task_input_on_node_prio(struct starpu_task *task, unsigned t
 		int node = _starpu_task_data_get_node_on_node(task, index, target_node);
 
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
-		prefetch_data_on_node(handle, node, replicate, mode, prio);
+		task_prefetch_data_on_node(handle, node, replicate, mode, prio);
 
 		_starpu_set_data_requested_flag_if_needed(handle, replicate);
 	}
+	task->prefetched = 1;
 
 	return 0;
 }
@@ -988,10 +1012,11 @@ int starpu_prefetch_task_input_for_prio(struct starpu_task *task, unsigned worke
 		int node = _starpu_task_data_get_node_on_worker(task, index, worker);
 
 		struct _starpu_data_replicate *replicate = &handle->per_node[node];
-		prefetch_data_on_node(handle, node, replicate, mode, prio);
+		task_prefetch_data_on_node(handle, node, replicate, mode, prio);
 
 		_starpu_set_data_requested_flag_if_needed(handle, replicate);
 	}
+	task->prefetched = 1;
 
 	return 0;
 }
@@ -1133,7 +1158,7 @@ int _starpu_fetch_task_input(struct starpu_task *task, struct _starpu_job *j, in
 
 		if (async)
 		{
-			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, 0, 1,
+			ret = _starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, STARPU_FETCH, 1,
 					_starpu_fetch_task_input_cb, worker, 0, "_starpu_fetch_task_input");
 #ifdef STARPU_SIMGRID
 			if (_starpu_simgrid_fetching_input_cost())
@@ -1223,7 +1248,19 @@ void _starpu_fetch_task_input_tail(struct starpu_task *task, struct _starpu_job
 		local_replicate = get_replicate(handle, mode, workerid, node);
 		_starpu_spin_lock(&handle->header_lock);
 		if (local_replicate->mc)
+		{
 			local_replicate->mc->diduse = 1;
+			if (task->prefetched &&
+				!(mode & (STARPU_SCRATCH|STARPU_REDUX)) &&
+				(mode & STARPU_R))
+			{
+				/* Allocations or transfer prefetchs should have been done by now and marked
+				 * this mc as needed for us.
+				 * Now that we added a reference for the task, we can relieve that.  */
+				STARPU_ASSERT(local_replicate->mc->nb_tasks_prefetch > 0);
+				local_replicate->mc->nb_tasks_prefetch--;
+			}
+		}
 		_starpu_spin_unlock(&handle->header_lock);
 
 		_STARPU_TASK_SET_INTERFACE(task , local_replicate->data_interface, descrs[index].index);
@@ -1372,7 +1409,7 @@ void _starpu_fetch_nowhere_task_input(struct _starpu_job *j)
 
 		local_replicate = get_replicate(handle, mode, -1, node);
 
-		_starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, 0, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
+		_starpu_fetch_data_on_node(handle, node, local_replicate, mode, 0, STARPU_FETCH, 1, _starpu_fetch_nowhere_task_input_cb, wrapper, 0, "_starpu_fetch_nowhere_task_input");
 	}
 
 	if (profiling && task->profiling_info)

+ 2 - 2
src/datawizard/coherency.h

@@ -298,7 +298,7 @@ struct _starpu_data_state
  * async means that _starpu_fetch_data_on_node will wait for completion of the request
  */
 int _starpu_fetch_data_on_node(starpu_data_handle_t handle, int node, struct _starpu_data_replicate *replicate,
-			       enum starpu_data_access_mode mode, unsigned detached, unsigned is_prefetch, unsigned async,
+			       enum starpu_data_access_mode mode, unsigned detached, enum _starpu_is_prefetch is_prefetch, unsigned async,
 			       void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 /** This releases a reference on the handle */
 void _starpu_release_data_on_node(struct _starpu_data_state *state, uint32_t default_wt_mask,
@@ -341,7 +341,7 @@ int _starpu_determine_request_path(starpu_data_handle_t handle,
  */
 struct _starpu_data_request *_starpu_create_request_to_fetch_data(starpu_data_handle_t handle,
 								  struct _starpu_data_replicate *dst_replicate,
-								  enum starpu_data_access_mode mode, unsigned is_prefetch,
+								  enum starpu_data_access_mode mode, enum _starpu_is_prefetch is_prefetch,
 								  unsigned async,
 								  void (*callback_func)(void *), void *callback_arg, int prio, const char *origin);
 

+ 2 - 2
src/datawizard/copy_driver.c

@@ -201,7 +201,7 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 									unsigned donotread,
 									struct _starpu_data_request *req,
 									unsigned may_alloc,
-									unsigned prefetch STARPU_ATTRIBUTE_UNUSED)
+									enum _starpu_is_prefetch prefetch STARPU_ATTRIBUTE_UNUSED)
 {
 	if (!donotread)
 	{
@@ -219,7 +219,7 @@ int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT _starpu_driver_copy_data_1_to_1(starpu_d
 			/* We're not supposed to allocate there at the moment */
 			return -ENOMEM;
 
-		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, req ? req->prefetch : 0);
+		int ret_alloc = _starpu_allocate_memory_on_node(handle, dst_replicate, req ? req->prefetch : STARPU_FETCH);
 		if (ret_alloc)
 			return -ENOMEM;
 	}

+ 10 - 1
src/datawizard/copy_driver.h

@@ -47,6 +47,15 @@ extern "C"
 struct _starpu_data_request;
 struct _starpu_data_replicate;
 
+enum _starpu_is_prefetch
+{
+	STARPU_FETCH = 0,		/* A task really needs it now! */
+	STARPU_TASK_PREFETCH = 1,	/* A task will need it soon */
+	STARPU_PREFETCH = 2,		/* It is a good idea to have it asap */
+	STARPU_IDLEFETCH = 3,		/* Get this here when you have time to */
+	STARPU_NFETCH
+};
+
 #ifdef STARPU_USE_MIC
 /** MIC needs memory_node to know which MIC is concerned.
  * mark is used to wait asynchronous request.
@@ -132,7 +141,7 @@ int _starpu_driver_copy_data_1_to_1(starpu_data_handle_t handle,
 				    unsigned donotread,
 				    struct _starpu_data_request *req,
 				    unsigned may_alloc,
-				    unsigned prefetch);
+				    enum _starpu_is_prefetch prefetch);
 
 unsigned _starpu_driver_test_request_completion(struct _starpu_async_channel *async_channel);
 void _starpu_driver_wait_request_completion(struct _starpu_async_channel *async_channel);

+ 44 - 29
src/datawizard/data_request.c

@@ -29,7 +29,7 @@
 #warning split into separate out/in queues for each node, so that MAX_PENDING_REQUESTS_PER_NODE is separate for them, since the links are bidirectionnal
 #endif
 static struct _starpu_data_request_prio_list data_requests[STARPU_MAXNODES];
-static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES];
+static struct _starpu_data_request_prio_list prefetch_requests[STARPU_MAXNODES]; /* Contains both task_prefetch and prefetch */
 static struct _starpu_data_request_prio_list idle_requests[STARPU_MAXNODES];
 static starpu_pthread_mutex_t data_requests_list_mutex[STARPU_MAXNODES];
 
@@ -124,7 +124,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 							 int handling_node,
 							 enum starpu_data_access_mode mode,
 							 unsigned ndeps,
-							 unsigned is_prefetch,
+							 enum _starpu_is_prefetch is_prefetch,
 							 int prio,
 							 unsigned is_write_invalidation,
 							 const char *origin)
@@ -156,6 +156,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 	STARPU_ASSERT(starpu_node_get_kind(handling_node) == STARPU_CPU_RAM || _starpu_memory_node_get_nworkers(handling_node));
 	r->completed = 0;
 	r->prefetch = is_prefetch;
+	r->nb_tasks_prefetch = 0;
 	r->prio = prio;
 	r->retval = -1;
 	r->ndeps = ndeps;
@@ -310,9 +311,9 @@ void _starpu_post_data_request(struct _starpu_data_request *r)
 
 	/* insert the request in the proper list */
 	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[handling_node]);
-	if (r->prefetch == 2)
+	if (r->prefetch >= STARPU_IDLEFETCH)
 		_starpu_data_request_prio_list_push_back(&idle_requests[handling_node], r);
-	else if (r->prefetch)
+	else if (r->prefetch > STARPU_FETCH)
 		_starpu_data_request_prio_list_push_back(&prefetch_requests[handling_node], r);
 	else
 		_starpu_data_request_prio_list_push_back(&data_requests[handling_node], r);
@@ -413,6 +414,10 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 	/* Remove a reference on the destination replicate for the request */
 	if (dst_replicate)
 	{
+		if (dst_replicate->mc)
+			/* Make sure it stays there for the task.  */
+			dst_replicate->mc->nb_tasks_prefetch += r->nb_tasks_prefetch;
+
 		STARPU_ASSERT(dst_replicate->refcnt > 0);
 		dst_replicate->refcnt--;
 	}
@@ -463,7 +468,7 @@ static void starpu_handle_data_request_completion(struct _starpu_data_request *r
 }
 
 /* TODO : accounting to see how much time was spent working for other people ... */
-static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned may_alloc, int prefetch)
+static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned may_alloc, enum _starpu_is_prefetch prefetch)
 {
 	starpu_data_handle_t handle = r->handle;
 
@@ -538,7 +543,7 @@ static int starpu_handle_data_request(struct _starpu_data_request *r, unsigned m
 	return 0;
 }
 
-static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list *reqlist, unsigned src_node, unsigned may_alloc, unsigned n, unsigned *pushed, unsigned prefetch)
+static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_list *reqlist, unsigned src_node, unsigned may_alloc, unsigned n, unsigned *pushed, enum _starpu_is_prefetch prefetch)
 {
 	struct _starpu_data_request *r;
 	struct _starpu_data_request_prio_list new_data_requests[prefetch + 1]; /* Indexed by prefetch level */
@@ -609,7 +614,7 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 			ret = res;
 			/* Prefetch requests might have gotten promoted while in tmp list */
 			_starpu_data_request_prio_list_push_back(&new_data_requests[r->prefetch], r);
-			if (prefetch)
+			if (prefetch > STARPU_FETCH)
 				/* Prefetching more there would make the situation even worse */
 				break;
 		}
@@ -639,20 +644,25 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 	if (i <= prefetch)
 	{
 		STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[src_node]);
-		if (!(_starpu_data_request_prio_list_empty(&new_data_requests[0])))
+		if (!(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_FETCH])))
+		{
+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_FETCH], &data_requests[src_node]);
+			data_requests[src_node] = new_data_requests[STARPU_FETCH];
+		}
+		if (prefetch >= STARPU_TASK_PREFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_TASK_PREFETCH])))
 		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[0], &data_requests[src_node]);
-			data_requests[src_node] = new_data_requests[0];
+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_TASK_PREFETCH], &prefetch_requests[src_node]);
+			prefetch_requests[src_node] = new_data_requests[STARPU_TASK_PREFETCH];
 		}
-		if (prefetch >= 1 && !(_starpu_data_request_prio_list_empty(&new_data_requests[1])))
+		if (prefetch >= STARPU_PREFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_PREFETCH])))
 		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[1], &prefetch_requests[src_node]);
-			prefetch_requests[src_node] = new_data_requests[1];
+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_PREFETCH], &prefetch_requests[src_node]);
+			prefetch_requests[src_node] = new_data_requests[STARPU_PREFETCH];
 		}
-		if (prefetch >= 2 && !(_starpu_data_request_prio_list_empty(&new_data_requests[2])))
+		if (prefetch >= STARPU_IDLEFETCH && !(_starpu_data_request_prio_list_empty(&new_data_requests[STARPU_IDLEFETCH])))
 		{
-			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[2], &idle_requests[src_node]);
-			idle_requests[src_node] = new_data_requests[2];
+			_starpu_data_request_prio_list_push_prio_list_back(&new_data_requests[STARPU_IDLEFETCH], &idle_requests[src_node]);
+			idle_requests[src_node] = new_data_requests[STARPU_IDLEFETCH];
 		}
 		STARPU_PTHREAD_MUTEX_UNLOCK(&data_requests_list_mutex[src_node]);
 
@@ -678,17 +688,17 @@ static int __starpu_handle_node_data_requests(struct _starpu_data_request_prio_l
 
 int _starpu_handle_node_data_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 {
-	return __starpu_handle_node_data_requests(data_requests, src_node, may_alloc, MAX_PENDING_REQUESTS_PER_NODE, pushed, 0);
+	return __starpu_handle_node_data_requests(data_requests, src_node, may_alloc, MAX_PENDING_REQUESTS_PER_NODE, pushed, STARPU_FETCH);
 }
 
 int _starpu_handle_node_prefetch_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 {
-	return __starpu_handle_node_data_requests(prefetch_requests, src_node, may_alloc, MAX_PENDING_PREFETCH_REQUESTS_PER_NODE, pushed, 1);
+	return __starpu_handle_node_data_requests(prefetch_requests, src_node, may_alloc, MAX_PENDING_PREFETCH_REQUESTS_PER_NODE, pushed, STARPU_PREFETCH);
 }
 
 int _starpu_handle_node_idle_requests(unsigned src_node, unsigned may_alloc, unsigned *pushed)
 {
-	return __starpu_handle_node_data_requests(idle_requests, src_node, may_alloc, MAX_PENDING_IDLE_REQUESTS_PER_NODE, pushed, 2);
+	return __starpu_handle_node_data_requests(idle_requests, src_node, may_alloc, MAX_PENDING_IDLE_REQUESTS_PER_NODE, pushed, STARPU_IDLEFETCH);
 }
 
 static int _handle_pending_node_data_requests(unsigned src_node, unsigned force)
@@ -839,11 +849,15 @@ int _starpu_check_that_no_data_request_is_pending(unsigned node)
 }
 
 
-void _starpu_update_prefetch_status(struct _starpu_data_request *r, unsigned prefetch)
+void _starpu_update_prefetch_status(struct _starpu_data_request *r, enum _starpu_is_prefetch prefetch)
 {
 	STARPU_ASSERT(r->prefetch > prefetch);
 	r->prefetch=prefetch;
 
+	if (prefetch >= STARPU_IDLEFETCH)
+		/* No possible actual change */
+		return;
+
 	/* We have to promote chained_request too! */
 	unsigned chained_req;
 	for (chained_req = 0; chained_req < r->next_req_count; chained_req++)
@@ -855,19 +869,20 @@ void _starpu_update_prefetch_status(struct _starpu_data_request *r, unsigned pre
 
 	STARPU_PTHREAD_MUTEX_LOCK(&data_requests_list_mutex[r->handling_node]);
 
+	int found = 1;
+
 	/* The request can be in a different list (handling request or the temp list)
-	 * we have to check that it is really in the prefetch list. */
+	 * we have to check that it is really in the prefetch or idle list. */
 	if (_starpu_data_request_prio_list_ismember(&prefetch_requests[r->handling_node], r))
-	{
-		_starpu_data_request_prio_list_erase(&prefetch_requests[r->handling_node],r);
-		_starpu_data_request_prio_list_push_back(&data_requests[r->handling_node],r);
-	}
-	/* The request can be in a different list (handling request or the temp list)
-	 * we have to check that it is really in the idle list. */
+		_starpu_data_request_prio_list_erase(&prefetch_requests[r->handling_node], r);
 	else if (_starpu_data_request_prio_list_ismember(&idle_requests[r->handling_node], r))
+		_starpu_data_request_prio_list_erase(&idle_requests[r->handling_node], r);
+	else
+		found = 0;
+
+	if (found)
 	{
-		_starpu_data_request_prio_list_erase(&idle_requests[r->handling_node],r);
-		if (prefetch == 1)
+		if (prefetch > STARPU_FETCH)
 			_starpu_data_request_prio_list_push_back(&prefetch_requests[r->handling_node],r);
 		else
 			_starpu_data_request_prio_list_push_back(&data_requests[r->handling_node],r);

+ 7 - 8
src/datawizard/data_request.h

@@ -79,12 +79,11 @@ LIST_TYPE(_starpu_data_request,
 	/** Whether the transfer is completed. */
 	unsigned completed;
 
-	/** Whether this is just a prefetch request:
-	 * 0 for fetch,
-	 * 1 for prefetch (dependencies have just been released)
-	 * 2 for idle (a good idea to do it some time, but no hurry at all)
-	 */
-	unsigned prefetch;
+	/** Whether this is just a prefetch request */
+	enum _starpu_is_prefetch prefetch;
+
+	/** Number of tasks which used this as a prefetch */
+	unsigned nb_tasks_prefetch;
 
 	/** Priority of the request. Default is 0 */
 	int prio;
@@ -151,7 +150,7 @@ struct _starpu_data_request *_starpu_create_data_request(starpu_data_handle_t ha
 							 int handling_node,
 							 enum starpu_data_access_mode mode,
 							 unsigned ndeps,
-							 unsigned is_prefetch,
+							 enum _starpu_is_prefetch is_prefetch,
 							 int prio,
 							 unsigned is_write_invalidation,
 							 const char *origin) STARPU_ATTRIBUTE_MALLOC;
@@ -162,5 +161,5 @@ void _starpu_data_request_append_callback(struct _starpu_data_request *r,
 					  void (*callback_func)(void *),
 					  void *callback_arg);
 
-void _starpu_update_prefetch_status(struct _starpu_data_request *r, unsigned prefetch);
+void _starpu_update_prefetch_status(struct _starpu_data_request *r, enum _starpu_is_prefetch prefetch);
 #endif // __DATA_REQUEST_H__

+ 1 - 1
src/datawizard/filters.c

@@ -193,7 +193,7 @@ static void _starpu_data_partition(starpu_data_handle_t initial_handle, starpu_d
 		int home_node = initial_handle->home_node;
 		if (home_node < 0 || (starpu_node_get_kind(home_node) != STARPU_CPU_RAM))
 			home_node = STARPU_MAIN_RAM;
-		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], 0);
+		int ret = _starpu_allocate_memory_on_node(initial_handle, &initial_handle->per_node[home_node], STARPU_FETCH);
 #ifdef STARPU_DEVEL
 #warning we should reclaim memory if allocation failed
 #endif

+ 1 - 1
src/datawizard/interfaces/data_interface.c

@@ -693,7 +693,7 @@ void _starpu_check_if_valid_and_fetch_data_on_node(starpu_data_handle_t handle,
 	}
 	if (valid)
 	{
-		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, 0, 0, NULL, NULL, 0, origin);
+		int ret = _starpu_fetch_data_on_node(handle, handle->home_node, replicate, STARPU_R, 0, STARPU_FETCH, 0, NULL, NULL, 0, origin);
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, handle->home_node, replicate);
 	}

+ 23 - 18
src/datawizard/memalloc.c

@@ -322,7 +322,7 @@ static int STARPU_ATTRIBUTE_WARN_UNUSED_RESULT transfer_subtree_to_node(starpu_d
 		{
 			/* This is the only copy, push it to destination */
 			struct _starpu_data_request *r;
-			r = _starpu_create_request_to_fetch_data(handle, dst_replicate, STARPU_R, 0, 0, NULL, NULL, 0, "transfer_subtree_to_node");
+			r = _starpu_create_request_to_fetch_data(handle, dst_replicate, STARPU_R, STARPU_FETCH, 0, NULL, NULL, 0, "transfer_subtree_to_node");
 			/* There is no way we don't need a request, since
 			 * source is OWNER, destination can't be having it */
 			STARPU_ASSERT(r);
@@ -546,7 +546,7 @@ static void reuse_mem_chunk(unsigned node, struct _starpu_data_replicate *new_re
 /* This function is called for memory chunks that are possibly in used (ie. not
  * in the cache). They should therefore still be associated to a handle. */
 /* mc_lock is held and may be temporarily released! */
-static size_t try_to_throw_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node, struct _starpu_data_replicate *replicate, unsigned is_already_in_mc_list)
+static size_t try_to_throw_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node, struct _starpu_data_replicate *replicate, unsigned is_already_in_mc_list, enum _starpu_is_prefetch is_prefetch)
 {
 	size_t freed = 0;
 
@@ -571,6 +571,10 @@ static size_t try_to_throw_mem_chunk(struct _starpu_mem_chunk *mc, unsigned node
 		/* Hasn't been used yet, avoid evicting it */
 		return 0;
 
+	if (mc->nb_tasks_prefetch && is_prefetch >= STARPU_TASK_PREFETCH)
+		/* We have not finished executing the tasks this was prefetched for */
+		return 0;
+
 	/* REDUX memchunk */
 	if (mc->relaxed_coherency == 2)
 	{
@@ -782,7 +786,7 @@ static int try_to_find_reusable_mc(unsigned node, starpu_data_handle_t data, str
 
 /* this function looks for a memory chunk that matches a given footprint in the
  * list of mem chunk that are not important */
-static int try_to_reuse_not_important_mc(unsigned node, starpu_data_handle_t data, struct _starpu_data_replicate *replicate, uint32_t footprint)
+static int try_to_reuse_not_important_mc(unsigned node, starpu_data_handle_t data, struct _starpu_data_replicate *replicate, uint32_t footprint, enum _starpu_is_prefetch is_prefetch)
 {
 	struct _starpu_mem_chunk *mc, *orig_next_mc, *next_mc;
 	int success = 0;
@@ -816,7 +820,7 @@ restart:
 		}
 
 		/* Note: this may unlock mc_list! */
-		success = try_to_throw_mem_chunk(mc, node, replicate, 1);
+		success = try_to_throw_mem_chunk(mc, node, replicate, 1, is_prefetch);
 
 		if (orig_next_mc)
 		{
@@ -841,11 +845,14 @@ restart:
  * Try to find a buffer currently in use on the memory node which has the given
  * footprint.
  */
-static int try_to_reuse_potentially_in_use_mc(unsigned node, starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, uint32_t footprint, int is_prefetch)
+static int try_to_reuse_potentially_in_use_mc(unsigned node, starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, uint32_t footprint, enum _starpu_is_prefetch is_prefetch)
 {
 	struct _starpu_mem_chunk *mc, *next_mc, *orig_next_mc;
 	int success = 0;
 
+	if (is_prefetch >= STARPU_IDLEFETCH)
+		/* Do not evict a MC just for an idle fetch */
+		return 0;
 	/*
 	 * We have to unlock mc_lock before locking header_lock, so we have
 	 * to be careful with the list.  We try to do just one pass, by
@@ -868,14 +875,11 @@ restart:
 		if (mc->remove_notify)
 			/* Somebody already working here, skip */
 			continue;
-		if (is_prefetch > 1)
-			/* Do not evict a MC just for an idle fetch */
-			continue;
-		if (is_prefetch == 1 && !mc->wontuse)
+		if (!mc->wontuse && is_prefetch >= STARPU_PREFETCH)
 			/* Do not evict something that we might reuse, just for a prefetch */
-			/* FIXME: but perhaps we won't have any task using it in
-                         * the close future, we should perhaps rather check
-                         * mc->replicate->refcnt? */
+			continue;
+		if (mc->nb_tasks_prefetch && is_prefetch >= STARPU_TASK_PREFETCH)
+			/* Do not evict something that we will reuse, just for a task prefetch */
 			continue;
 		if (mc->footprint != footprint || _starpu_data_interface_compare(handle->per_node[node].data_interface, handle->ops, mc->data->per_node[node].data_interface, mc->ops) != 1)
 			/* Not the right type of interface, skip */
@@ -889,7 +893,7 @@ restart:
 		}
 
 		/* Note: this may unlock mc_list! */
-		success = try_to_throw_mem_chunk(mc, node, replicate, 1);
+		success = try_to_throw_mem_chunk(mc, node, replicate, 1, is_prefetch);
 
 		if (orig_next_mc)
 		{
@@ -999,7 +1003,7 @@ restart2:
 				next_mc->remove_notify = &next_mc;
 			}
 			/* Note: this may unlock mc_list! */
-			freed += try_to_throw_mem_chunk(mc, node, NULL, 0);
+			freed += try_to_throw_mem_chunk(mc, node, NULL, 0, STARPU_FETCH);
 
 			if (orig_next_mc)
 			{
@@ -1218,7 +1222,7 @@ void starpu_memchunk_tidy(unsigned node)
 			}
 
 			_starpu_spin_unlock(&mc_lock[node]);
-			if (!_starpu_create_request_to_fetch_data(handle, &handle->per_node[target_node], STARPU_R, 2, 1, NULL, NULL, 0, "starpu_memchunk_tidy"))
+			if (!_starpu_create_request_to_fetch_data(handle, &handle->per_node[target_node], STARPU_R, STARPU_IDLEFETCH, 1, NULL, NULL, 0, "starpu_memchunk_tidy"))
 			{
 				/* No request was actually needed??
 				 * Odd, but cope with it.  */
@@ -1317,6 +1321,7 @@ static struct _starpu_mem_chunk *_starpu_memchunk_init(struct _starpu_data_repli
 	mc->size_interface = interface_size;
 	mc->remove_notify = NULL;
 	mc->diduse = 0;
+	mc->nb_tasks_prefetch = 0;
 	mc->wontuse = 0;
 
 	return mc;
@@ -1430,7 +1435,7 @@ void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _star
  *
  */
 
-static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, unsigned is_prefetch)
+static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned dst_node, enum _starpu_is_prefetch is_prefetch)
 {
 	unsigned attempts = 0;
 	starpu_ssize_t allocated_memory;
@@ -1514,7 +1519,7 @@ static starpu_ssize_t _starpu_allocate_interface(starpu_data_handle_t handle, st
 			reclaim -= freed;
 
 			/* Try to reuse an allocated data with the same interface (to avoid spurious free/alloc) */
-			if (_starpu_has_not_important_data && try_to_reuse_not_important_mc(dst_node, handle, replicate, footprint))
+			if (_starpu_has_not_important_data && try_to_reuse_not_important_mc(dst_node, handle, replicate, footprint, is_prefetch))
 				break;
 			if (try_to_reuse_potentially_in_use_mc(dst_node, handle, replicate, footprint, is_prefetch))
 			{
@@ -1596,7 +1601,7 @@ out:
 	return allocated_memory;
 }
 
-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch)
+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum _starpu_is_prefetch is_prefetch)
 {
 	starpu_ssize_t allocated_memory;
 

+ 9 - 1
src/datawizard/memalloc.h

@@ -26,6 +26,7 @@
 #include <datawizard/interfaces/data_interface.h>
 #include <datawizard/coherency.h>
 #include <datawizard/copy_driver.h>
+#include <datawizard/data_request.h>
 
 struct _starpu_data_replicate;
 
@@ -59,10 +60,17 @@ LIST_TYPE(_starpu_mem_chunk,
 	/** Whether the memchunk is in the clean part of the mc_list */
 	unsigned clean:1;
 	/** Was this chunk used since it got allocated?  */
+	/* FIXME: probably useless now with nb_tasks_prefetch */
 	unsigned diduse:1;
 	/** Was this chunk marked as "won't use"? */
 	unsigned wontuse:1;
 
+	/** The number of prefetches that we made for this mc for various tasks
+	 * This is also the number of tasks that we will wait to see use this mc before
+	 * we attempt to evict it.
+	 */
+	unsigned nb_tasks_prefetch;
+
 	/** the size of the data is only set when calling _starpu_request_mem_chunk_removal(),
 	 * it is needed to estimate how much memory is in mc_cache, and by
 	 * free_memory_on_node() which is called when the handle is no longer
@@ -84,7 +92,7 @@ void _starpu_init_mem_chunk_lists(void);
 void _starpu_deinit_mem_chunk_lists(void);
 void _starpu_mem_chunk_init_last(void);
 void _starpu_request_mem_chunk_removal(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned node, size_t size);
-int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, unsigned is_prefetch);
+int _starpu_allocate_memory_on_node(starpu_data_handle_t handle, struct _starpu_data_replicate *replicate, enum _starpu_is_prefetch is_prefetch);
 size_t _starpu_free_all_automatically_allocated_buffers(unsigned node);
 void _starpu_memchunk_recently_used(struct _starpu_mem_chunk *mc, unsigned node);
 void _starpu_memchunk_wont_use(struct _starpu_mem_chunk *m, unsigned nodec);

+ 6 - 6
src/datawizard/user_interactions.c

@@ -47,7 +47,7 @@ int starpu_data_request_allocation(starpu_data_handle_t handle, unsigned node)
 
 	_starpu_spin_lock(&handle->header_lock);
 
-	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, STARPU_NONE, 0, 1, 0, 0, "starpu_data_request_allocation");
+	r = _starpu_create_data_request(handle, NULL, &handle->per_node[node], node, STARPU_NONE, 0, STARPU_PREFETCH, 0, 0, "starpu_data_request_allocation");
 
 	/* we do not increase the refcnt associated to the request since we are
 	 * not waiting for its termination */
@@ -68,7 +68,7 @@ struct user_interaction_wrapper
 	starpu_pthread_mutex_t lock;
 	unsigned finished;
 	unsigned detached;
-	unsigned prefetch;
+	enum _starpu_is_prefetch prefetch;
 	unsigned async;
 	int prio;
 	void (*callback)(void *);
@@ -536,7 +536,7 @@ static void _prefetch_data_on_node(void *arg)
 }
 
 static
-int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigned node, unsigned async, enum starpu_data_access_mode mode, unsigned prefetch, int prio)
+int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigned node, unsigned async, enum starpu_data_access_mode mode, enum _starpu_is_prefetch prefetch, int prio)
 {
 	STARPU_ASSERT(handle);
 
@@ -596,12 +596,12 @@ int _starpu_prefetch_data_on_node_with_mode(starpu_data_handle_t handle, unsigne
 
 int starpu_data_fetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
 {
-	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, 0, 0);
+	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_FETCH, 0);
 }
 
 int starpu_data_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
 {
-	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, 1, prio);
+	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_PREFETCH, prio);
 }
 
 int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)
@@ -611,7 +611,7 @@ int starpu_data_prefetch_on_node(starpu_data_handle_t handle, unsigned node, uns
 
 int starpu_data_idle_prefetch_on_node_prio(starpu_data_handle_t handle, unsigned node, unsigned async, int prio)
 {
-	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, 2, prio);
+	return _starpu_prefetch_data_on_node_with_mode(handle, node, async, STARPU_R, STARPU_IDLEFETCH, prio);
 }
 
 int starpu_data_idle_prefetch_on_node(starpu_data_handle_t handle, unsigned node, unsigned async)

+ 1 - 1
src/datawizard/write_back.c

@@ -64,7 +64,7 @@ void _starpu_write_through_data(starpu_data_handle_t handle, unsigned requesting
 
 				struct _starpu_data_request *r;
 				r = _starpu_create_request_to_fetch_data(handle, &handle->per_node[node],
-									 STARPU_R, 2, 1, wt_callback, handle, 0, "_starpu_write_through_data");
+									 STARPU_R, STARPU_IDLEFETCH, 1, wt_callback, handle, 0, "_starpu_write_through_data");
 
 			        /* If no request was created, the handle was already up-to-date on the
 			         * node */

+ 2 - 2
src/debug/latency.c

@@ -34,7 +34,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		_starpu_spin_unlock(&handle->header_lock);
 
 		struct _starpu_data_replicate *replicate_0 = &handle->per_node[node0];
-		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, 0, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
+		ret = _starpu_fetch_data_on_node(handle, node0, replicate_0, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, node0, replicate_0);
 
@@ -44,7 +44,7 @@ void _starpu_benchmark_ping_pong(starpu_data_handle_t handle,
 		_starpu_spin_unlock(&handle->header_lock);
 
 		struct _starpu_data_replicate *replicate_1 = &handle->per_node[node1];
-		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, 0, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
+		ret = _starpu_fetch_data_on_node(handle, node1, replicate_1, STARPU_RW, 0, STARPU_FETCH, 0, NULL, NULL, 0, "_starpu_benchmark_ping_pong");
 		STARPU_ASSERT(!ret);
 		_starpu_release_data_on_node(handle, node1, replicate_1);
 	}

+ 8 - 6
src/debug/traces/starpu_fxt.c

@@ -20,6 +20,7 @@
 #include <starpu.h>
 #include <common/config.h>
 #include <common/uthash.h>
+#include <datawizard/copy_driver.h>
 #include <string.h>
 
 #ifdef STARPU_HAVE_POTI
@@ -2268,13 +2269,14 @@ static void handle_mpi_data_set_tag(struct fxt_ev_64 *ev, struct starpu_fxt_opti
 	data->mpi_tag = tag;
 }
 
-static const char *copy_link_type(unsigned prefetch)
+static const char *copy_link_type(enum _starpu_is_prefetch prefetch)
 {
 	switch (prefetch)
 	{
-		case 0: return "F";
-		case 1: return "PF";
-		case 2: return "IF";
+		case STARPU_FETCH: return "F";
+		case STARPU_TASK_PREFETCH: return "TF";
+		case STARPU_PREFETCH: return "PF";
+		case STARPU_IDLEFETCH: return "IF";
 		default: STARPU_ASSERT(0);
 	}
 }
@@ -2285,7 +2287,7 @@ static void handle_start_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_opt
 	unsigned dst = ev->param[1];
 	unsigned size = ev->param[2];
 	unsigned comid = ev->param[3];
-	unsigned prefetch = ev->param[4];
+	enum _starpu_is_prefetch prefetch = ev->param[4];
 	unsigned long handle = ev->param[5];
 	const char *link_type = copy_link_type(prefetch);
 
@@ -2367,7 +2369,7 @@ static void handle_end_driver_copy(struct fxt_ev_64 *ev, struct starpu_fxt_optio
 	unsigned dst = ev->param[1];
 	unsigned long size = ev->param[2];
 	unsigned comid = ev->param[3];
-	unsigned prefetch = ev->param[4];
+	enum _starpu_is_prefetch prefetch = ev->param[4];
 	const char *link_type = copy_link_type(prefetch);
 
 	char *prefix = options->file_prefix;

+ 2 - 0
src/debug/traces/starpu_paje.c

@@ -398,6 +398,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 	/* Link types */
 	poti_DefineLinkType("MPIL", "MPIP", "MPICt", "MPICt", "MPI communication");
 	poti_DefineLinkType("F", "P", "Mm", "Mm", "Intra-node data Fetch");
+	poti_DefineLinkType("TF", "P", "Mm", "Mm", "Intra-node data TaskPreFetch");
 	poti_DefineLinkType("PF", "P", "Mm", "Mm", "Intra-node data PreFetch");
 	poti_DefineLinkType("IF", "P", "Mm", "Mm", "Intra-node data IdleFetch");
 	poti_DefineLinkType("WSL", "P", "W", "W", "Work steal");
@@ -551,6 +552,7 @@ void _starpu_fxt_write_paje_header(FILE *file STARPU_ATTRIBUTE_UNUSED, struct st
 6       No       MS     Nothing         \".0 .0 .0\"		\n\
 5       MPIL     MPIP	MPICt	MPICt   \"MPI communication\"\n\
 5       F       P	Mm	Mm      \"Intra-node data Fetch\"\n\
+5       TF      P	Mm	Mm      \"Intra-node data TaskPreFetch\"\n\
 5       PF      P	Mm	Mm      \"Intra-node data PreFetch\"\n\
 5       IF      P	Mm	Mm      \"Intra-node data IdleFetch\"\n\
 5       WSL     P	W	W       \"Work steal\"\n");

+ 2 - 2
src/drivers/cuda/driver_cuda.c

@@ -537,7 +537,7 @@ static int start_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *worke
 				async ? &task_finished[workerid][pipeline_idx] : NULL);
 		}
 #else
-#ifdef HAVE_LIBNVIDIA_ML
+#ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
 		unsigned long long energy_start = 0;
 		nvmlReturn_t nvmlRet = -1;
 		if (profiling && task->profiling_info)
@@ -561,7 +561,7 @@ static void finish_job_on_cuda(struct _starpu_job *j, struct _starpu_worker *wor
 	int profiling = starpu_profiling_status_get();
 
 
-#ifdef HAVE_LIBNVIDIA_ML
+#ifdef HAVE_NVMLDEVICEGETTOTALENERGYCONSUMPTION
 	if (profiling && j->task->profiling_info && j->task->profiling_info->energy_consumed)
 	{
 		unsigned long long energy_end;

+ 0 - 1
src/sched_policies/component_sched.c

@@ -160,7 +160,6 @@ void starpu_sched_component_prefetch_on_node(struct starpu_sched_component * com
 		int worker = starpu_bitmap_first(&component->workers_in_ctx);
 		unsigned memory_node = starpu_worker_get_memory_node(worker);
 		starpu_prefetch_task_input_on_node(task, memory_node);
-		task->prefetched = 1;
 	}
 }
 

+ 12 - 0
starpufft/src/cudax_kernels.cu

@@ -30,12 +30,16 @@
 	{			   \
 		dim3 dimGrid(n); \
 		func <<<dimGrid, 1, 0, starpu_cuda_get_local_stream()>>> args; \
+		cudaError_t status = cudaGetLastError(); \
+		if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status); \
 	} 					\
 	else 					\
 	{				     \
 		dim3 dimGrid(n / threads_per_block); \
 		dim3 dimBlock(threads_per_block); \
 		func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
+		cudaError_t status = cudaGetLastError(); \
+		if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status); \
 	} \
 	cudaStreamSynchronize(starpu_cuda_get_local_stream()); \
 
@@ -85,12 +89,16 @@ extern "C" void STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComple
 		{			    \
 			dim3 dimGrid(n, m); \
 			func <<<dimGrid, 1, 0, starpu_cuda_get_local_stream()>>> args; \
+			cudaError_t status = cudaGetLastError(); \
+			if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status); \
 		} \
 		else \
 		{					      \
 			dim3 dimGrid(1, m / threads_per_dim); \
 			dim3 dimBlock(n, threads_per_dim); \
 			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
+			cudaError_t status = cudaGetLastError(); \
+			if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status); \
 		} \
 	} \
 	else \
@@ -100,12 +108,16 @@ extern "C" void STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComple
 			dim3 dimGrid(n / threads_per_dim, 1); \
 			dim3 dimBlock(threads_per_dim, m); \
 			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
+			cudaError_t status = cudaGetLastError(); \
+			if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status); \
 		} \
 		else \
 		{							\
 			dim3 dimGrid(n / threads_per_dim, m / threads_per_dim); \
 			dim3 dimBlock(threads_per_dim, threads_per_dim); \
 			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
+			cudaError_t status = cudaGetLastError(); \
+			if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status); \
 		} \
 	} \
 	cudaStreamSynchronize(starpu_cuda_get_local_stream()); \

+ 2 - 0
starpurm/dev/cuda_vector_scale/vs_cuda_kernel.cu

@@ -59,4 +59,6 @@ extern "C" void vector_scale_cuda_func(void *cl_buffers[], void *cl_arg)
 	unsigned nb_threads_per_block = 64;
 	unsigned nb_blocks = (n + nb_threads_per_block-1) / nb_threads_per_block;
 	vector_scale_cuda_kernel<<<nb_blocks,nb_threads_per_block,0,starpu_cuda_get_local_stream()>>>(vector, n, scalar);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
tests/datawizard/acquire_release_cuda.cu

@@ -26,4 +26,6 @@ extern "C" void increment_cuda(void *descr[], void *cl_arg)
 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	_increment_cuda_codelet<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
tests/datawizard/cuda_codelet_unsigned_inc.cu

@@ -27,4 +27,6 @@ extern "C" void cuda_codelet_unsigned_inc(void *descr[], void *cl_arg)
 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	_cuda_unsigned_inc<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
tests/datawizard/interfaces/bcsr/bcsr_cuda.cu

@@ -60,6 +60,8 @@ extern "C" void test_bcsr_cuda_func(void *buffers[], void *args)
 
         bcsr_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>
 		(val, nnz, ret, factor);
+	error = cudaGetLastError();
+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
 
 	error = cudaMemcpyAsync(&bcsr_config.copy_failed,
 			   ret,

+ 2 - 0
tests/datawizard/interfaces/block/block_cuda.cu

@@ -70,6 +70,8 @@ extern "C" void test_block_cuda_func(void *buffers[], void *args)
 
         block_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>
 		(block, nx, ny, nz, ldy, ldz, factor, ret);
+	error = cudaGetLastError();
+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
 	error = cudaMemcpyAsync(&block_config.copy_failed, ret, sizeof(int), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);

+ 2 - 0
tests/datawizard/interfaces/coo/coo_cuda.cu

@@ -57,6 +57,8 @@ extern "C" void test_coo_cuda_func(void *buffers[], void *args)
 
         coo_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>
 		(val, nvalues, ret, factor);
+	error = cudaGetLastError();
+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
 
 	error = cudaMemcpyAsync(&coo_config.copy_failed,
 			   ret,

+ 2 - 0
tests/datawizard/interfaces/csr/csr_cuda.cu

@@ -56,6 +56,8 @@ extern "C" void test_csr_cuda_func(void *buffers[], void *args)
 		STARPU_CUDA_REPORT_ERROR(error);
 
         csr_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>> (val, nnz, ret, factor);
+	error = cudaGetLastError();
+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
 
 	error = cudaMemcpyAsync(&csr_config.copy_failed,
 			   ret,

+ 2 - 0
tests/datawizard/interfaces/matrix/matrix_cuda.cu

@@ -59,6 +59,8 @@ extern "C" void test_matrix_cuda_func(void *buffers[], void *args)
 		STARPU_CUDA_REPORT_ERROR(error);
 
         matrix_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(val, n, ret, factor);
+	error = cudaGetLastError();
+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
 
 	error = cudaMemcpyAsync(&matrix_config.copy_failed,
 			   ret,

+ 2 - 0
tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets_cuda.cu

@@ -45,4 +45,6 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         cpu_to_cuda_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(src, dst, n);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
tests/datawizard/interfaces/multiformat/multiformat_cuda.cu

@@ -65,6 +65,8 @@ extern "C" void test_multiformat_cuda_func(void *buffers[], void *args)
 		STARPU_CUDA_REPORT_ERROR(error);
 
         multiformat_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(soa, n, ret, factor);
+	error = cudaGetLastError();
+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
 
 	error = cudaMemcpyAsync(&multiformat_config.copy_failed,
 			   ret,

+ 2 - 0
tests/datawizard/interfaces/tensor/tensor_cuda.cu

@@ -75,6 +75,8 @@ extern "C" void test_tensor_cuda_func(void *buffers[], void *args)
 
         tensor_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>
 		(tensor, nx, ny, nz, nt, ldy, ldz, ldt, factor, ret);
+	error = cudaGetLastError();
+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
 	error = cudaMemcpyAsync(&tensor_config.copy_failed, ret, sizeof(int), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);

+ 2 - 0
tests/datawizard/interfaces/variable/variable_cuda.cu

@@ -55,6 +55,8 @@ extern "C" void test_variable_cuda_func(void *buffers[], void *args)
 	unsigned nblocks = 1;
 
         variable_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(val, ret, factor);
+	error = cudaGetLastError();
+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
 	error = cudaMemcpyAsync(&variable_config.copy_failed,
 			   ret,
 			   sizeof(int),

+ 2 - 0
tests/datawizard/interfaces/vector/vector_cuda.cu

@@ -55,6 +55,8 @@ extern "C" void test_vector_cuda_func(void *buffers[], void *args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         framework_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(val, n, ret, factor);
+	error = cudaGetLastError();
+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
 	error = cudaMemcpyAsync(&vector_config.copy_failed, ret, sizeof(int), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	if (error != cudaSuccess)
 	{

+ 2 - 0
tests/datawizard/scal_cuda.cu

@@ -32,4 +32,6 @@ extern "C" void scal_func_cuda(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(val, n);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
tests/datawizard/scratch_cuda.cu

@@ -46,4 +46,6 @@ extern "C" void cuda_f(void *descr[], void *_args)
 	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nx / nblocks));
 
 	increment_vector<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(v, tmp, nx);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 4 - 0
tests/datawizard/sync_and_notify_data_kernels.cu

@@ -30,6 +30,8 @@ extern "C" void cuda_codelet_incA(void *descr[], void *_args)
 	unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	_cuda_incA<<<1,1, 0, starpu_cuda_get_local_stream()>>>(v);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 
 /*
@@ -46,4 +48,6 @@ extern "C" void cuda_codelet_incC(void *descr[], void *_args)
 	unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	_cuda_incC<<<1,1, 0, starpu_cuda_get_local_stream()>>>(v);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
tests/main/increment.cu

@@ -27,4 +27,6 @@ extern "C" void cuda_host_increment(void *descr[], void *_args)
 	unsigned *var = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	cuda_increment<<<1,1, 0, starpu_cuda_get_local_stream()>>>(var);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
tests/overlap/long_kernel.cu

@@ -29,4 +29,6 @@ extern "C" void long_kernel_cuda(unsigned long niters)
 	dim3 dimBlock(1,1);
 	dim3 dimGrid(1,1);
 	long_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(niters);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 8 - 1
tools/gdbinit

@@ -189,6 +189,9 @@ define starpu-workers
     if $worker->status == STATUS_WAITING
       set $status="WAITING"
     end
+    if $worker->status == STATUS_SLEEPING_SCHEDULING
+      set $status="SLEEPING_SCHEDULING"
+    end
     if $worker->status == STATUS_SLEEPING
       set $status="SLEEPING"
     end
@@ -635,7 +638,11 @@ define starpu-print-requests-tree
 end
 
 define starpu-print-requests-list
-  starpu-print-requests-tree $arg0.tree.root
+  if _starpu_debug
+    starpu-print-requests-treelist &$arg0.list
+  else
+    starpu-print-requests-tree $arg0.tree.root
+  end
 end
 
 define starpu-print-requests