Преглед изворни кода

Document that one should check that the kernel did properly start

Samuel Thibault пре 4 година
родитељ
комит
75e93fa729
51 измењених фајлова са 123 додато и 0 уклоњено
  1. 2 0
      doc/doxygen/chapters/210_check_list_performance.doxy
  2. 2 0
      doc/doxygen/chapters/code/vector_scal_cuda.c
  3. 2 0
      doc/tutorial/vector_scal_cuda.cu
  4. 2 0
      examples/basic_examples/block_cuda.cu
  5. 2 0
      examples/basic_examples/multiformat_conversion_codelets_cuda.cu
  6. 2 0
      examples/basic_examples/multiformat_cuda.cu
  7. 2 0
      examples/basic_examples/variable_kernels.cu
  8. 2 0
      examples/basic_examples/vector_scal_cuda.cu
  9. 2 0
      examples/filters/custom_mf/conversion.cu
  10. 2 0
      examples/filters/custom_mf/cuda.cu
  11. 2 0
      examples/filters/fblock_cuda.cu
  12. 4 0
      examples/filters/fmultiple_cuda.cu
  13. 2 0
      examples/incrementer/incrementer_kernels.cu
  14. 2 0
      examples/interface/complex_kernels.cu
  15. 2 0
      examples/pi/SobolQRNG/sobol_gpu.cu
  16. 4 0
      examples/pi/pi_kernel.cu
  17. 4 0
      examples/pi/pi_redux_kernel.cu
  18. 2 0
      examples/reductions/dot_product_kernels.cu
  19. 2 0
      examples/sched_ctx/axpy_partition_gpu.cu
  20. 2 0
      examples/spmv/spmv_cuda.cu
  21. 2 0
      examples/stencil/life_cuda.cu
  22. 2 0
      examples/stencil/shadow.cu
  23. 2 0
      julia/examples/mult/gpu_mult.cu
  24. 2 0
      julia/examples/old_examples/gpu_mult.cu
  25. 2 0
      julia/examples/old_examples/mandelbrot/gpu_mandelbrot.cu
  26. 2 0
      julia/examples/old_examples/mandelbrot/gpu_mandelbrot_between.cu
  27. 2 0
      julia/examples/old_examples/mult/gpu_mult.cu
  28. 4 0
      julia/examples/old_examples/nbody/gpu_nbody.cu
  29. 2 0
      julia/examples/old_examples/nbody/gpu_nbody_between.cu
  30. 4 0
      julia/src/compiler/expressions.jl
  31. 2 0
      mpi/tests/ring_kernel.cu
  32. 1 0
      src/datawizard/memalloc.c
  33. 12 0
      starpufft/src/cudax_kernels.cu
  34. 2 0
      starpurm/dev/cuda_vector_scale/vs_cuda_kernel.cu
  35. 2 0
      tests/datawizard/acquire_release_cuda.cu
  36. 2 0
      tests/datawizard/cuda_codelet_unsigned_inc.cu
  37. 2 0
      tests/datawizard/interfaces/bcsr/bcsr_cuda.cu
  38. 2 0
      tests/datawizard/interfaces/block/block_cuda.cu
  39. 2 0
      tests/datawizard/interfaces/coo/coo_cuda.cu
  40. 2 0
      tests/datawizard/interfaces/csr/csr_cuda.cu
  41. 2 0
      tests/datawizard/interfaces/matrix/matrix_cuda.cu
  42. 2 0
      tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets_cuda.cu
  43. 2 0
      tests/datawizard/interfaces/multiformat/multiformat_cuda.cu
  44. 2 0
      tests/datawizard/interfaces/tensor/tensor_cuda.cu
  45. 2 0
      tests/datawizard/interfaces/variable/variable_cuda.cu
  46. 2 0
      tests/datawizard/interfaces/vector/vector_cuda.cu
  47. 2 0
      tests/datawizard/scal_cuda.cu
  48. 2 0
      tests/datawizard/scratch_cuda.cu
  49. 4 0
      tests/datawizard/sync_and_notify_data_kernels.cu
  50. 2 0
      tests/main/increment.cu
  51. 2 0
      tests/overlap/long_kernel.cu

+ 2 - 0
doc/doxygen/chapters/210_check_list_performance.doxy

@@ -91,6 +91,8 @@ operations to avoid this issue. For instance:
 
 \code{.c}
 func <<<grid,block,0,starpu_cuda_get_local_stream()>>> (foo, bar);
+cudaError_t status = cudaGetLastError();
+if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 cudaStreamSynchronize(starpu_cuda_get_local_stream());
 \endcode
 

+ 2 - 0
doc/doxygen/chapters/code/vector_scal_cuda.c

@@ -35,6 +35,8 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(n, val, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
         cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
doc/tutorial/vector_scal_cuda.cu

@@ -35,6 +35,8 @@ extern "C" void vector_scal_cuda(void *buffers[], void *_args)
         unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
         cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
examples/basic_examples/block_cuda.cu

@@ -40,5 +40,7 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
         float *multiplier = (float *)_args;
 
         cuda_block<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *multiplier);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
examples/basic_examples/multiformat_conversion_codelets_cuda.cu

@@ -44,4 +44,6 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         cpu_to_cuda_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(src, dst, n);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/basic_examples/multiformat_cuda.cu

@@ -39,6 +39,8 @@ extern "C" void multiformat_scal_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         multiformat_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(soa, n);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
examples/basic_examples/variable_kernels.cu

@@ -27,5 +27,7 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 	float *val = (float *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	cuda_variable<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
examples/basic_examples/vector_scal_cuda.cu

@@ -41,4 +41,6 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(n, val, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/filters/custom_mf/conversion.cu

@@ -45,4 +45,6 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         custom_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(aop, n, x, y);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/filters/custom_mf/cuda.cu

@@ -39,4 +39,6 @@ extern "C" void custom_scal_cuda_func(void *buffers[], void *_args)
 	unsigned threads_per_block = 64;
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
         scal_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(n, x, y);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/filters/fblock_cuda.cu

@@ -43,4 +43,6 @@ extern "C" void cuda_func(void *buffers[], void *_args)
 
         /* TODO: use more blocks and threads in blocks */
         fblock_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(block, nx, ny, nz, ldy, ldz, *factor);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 4 - 0
examples/filters/fmultiple_cuda.cu

@@ -44,6 +44,8 @@ extern "C" void fmultiple_check_scale_cuda(void *buffers[], void *cl_arg)
 
         /* TODO: use more vals and threads in vals */
 	_fmultiple_check_scale_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val, nx, ny, ld, start, factor);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 
 static __global__ void _fmultiple_check_cuda(int *val, int nx, int ny, unsigned ld, int start, int factor)
@@ -71,4 +73,6 @@ extern "C" void fmultiple_check_cuda(void *buffers[], void *cl_arg)
 
         /* TODO: use more vals and threads in vals */
 	_fmultiple_check_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val, nx, ny, ld, start, factor);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/incrementer/incrementer_kernels.cu

@@ -32,4 +32,6 @@ extern "C" void cuda_codelet(void *descr[], void *_args)
 	float *val = (float *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/interface/complex_kernels.cu

@@ -44,4 +44,6 @@ extern "C" void copy_complex_codelet_cuda(void *descr[], void *_args)
 	unsigned nblocks = (nx + threads_per_block-1) / threads_per_block;
 
         complex_copy_cuda<<<nblocks, threads_per_block, 0, starpu_cuda_get_local_stream()>>>(o_real, o_imaginary, i_real, i_imaginary, nx);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/pi/SobolQRNG/sobol_gpu.cu

@@ -165,4 +165,6 @@ extern "C" void sobolGPU(int n_vectors, int n_dimensions, unsigned int *d_direct
 
     // Execute GPU kernel
     sobolGPU_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(n_vectors, n_dimensions, d_directions, d_output);
+    cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 4 - 0
examples/pi/pi_kernel.cu

@@ -137,12 +137,16 @@ extern "C" void cuda_kernel(void *descr[], void *cl_arg)
 	/* each entry of per_block_cnt contains the number of successful shots
 	 * in the corresponding block. */
 	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(random_numbers_x, random_numbers_y, nx, per_block_cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 
 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
 
 	/* compute the total number of successful shots by adding the elements
 	 * of the per_block_cnt array */
 	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	if (cures)
 		STARPU_CUDA_REPORT_ERROR(cures);

+ 4 - 0
examples/pi/pi_redux_kernel.cu

@@ -115,12 +115,16 @@ extern "C" void pi_redux_cuda_kernel(float *x, float *y, unsigned n, unsigned lo
 	/* each entry of per_block_cnt contains the number of successful shots
 	 * in the corresponding block. */
 	monte_carlo<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(x, y, n, per_block_cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 
 	/* Note that we do not synchronize between kernel calls because there is an implicit serialization */
 
 	/* compute the total number of successful shots by adding the elements
 	 * of the per_block_cnt array */
 	sum_per_block_cnt<<<1, nblocks, 0, starpu_cuda_get_local_stream()>>>(per_block_cnt, shot_cnt);
+	cures = cudaGetLastError();
+	if (cures != cudaSuccess) STARPU_CUDA_REPORT_ERROR(cures);
 	cures = cudaStreamSynchronize(starpu_cuda_get_local_stream());
 	if (cures)
 		STARPU_CUDA_REPORT_ERROR(cures);

+ 2 - 0
examples/reductions/dot_product_kernels.cu

@@ -33,4 +33,6 @@ extern "C" void redux_cuda_func(void *descr[], void *_args)
 	DOT_TYPE *dotb = (DOT_TYPE *)STARPU_VARIABLE_GET_PTR(descr[1]);
 
 	cuda_redux<<<1,1, 0, starpu_cuda_get_local_stream()>>>(dota, dotb);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/sched_ctx/axpy_partition_gpu.cu

@@ -73,4 +73,6 @@ extern "C" void cuda_axpy(void *descr[], void *_args)
 	__P_HOSTSETUP(saxpy_partitioned,dim3(dimensions,1,1),dimensions,0,SM_mapping_start,SM_allocation,stream);
 
   	saxpy_partitioned<<<width,dimensions,0,stream>>>(__P_HKARGS,n,a,x,y);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/spmv/spmv_cuda.cu

@@ -97,6 +97,8 @@ extern "C" void spmv_kernel_cuda(void *descr[], void *args)
 
 	spmv_kernel_3<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>
 		(nnz, nrow, nzval, colind, rowptr, firstentry, vecin, nx_in, vecout, nx_out);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 
 

+ 2 - 0
examples/stencil/life_cuda.cu

@@ -73,4 +73,6 @@ extern "C" void cuda_life_update_host(int bz, const TYPE *old, TYPE *newp, int n
 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
 #endif
 	cuda_life_update <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, old, newp, nx, ny, nz, ldy, ldz, iter);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
examples/stencil/shadow.cu

@@ -53,4 +53,6 @@ extern "C" void cuda_shadow_host(int bz, TYPE *ptr, int nx, int ny, int nz, int
 	dim3 dimGrid((nx + threads_per_dim_x-1) / threads_per_dim_x, (ny + threads_per_dim_y-1) / threads_per_dim_y);
 #endif
 	cuda_shadow <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> (bz, ptr, nx, ny, nz, ldy, ldz, i);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
julia/examples/mult/gpu_mult.cu

@@ -79,6 +79,8 @@ extern "C" void gpu_mult(void * descr[], void * args)
 	gpuMultKernel
 		<<< nblocks, THREADS_PER_BLOCK, 0, NULL /*starpu_cuda_get_local_stream()*/
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 

+ 2 - 0
julia/examples/old_examples/gpu_mult.cu

@@ -78,6 +78,8 @@ extern "C" void gpu_mult(void * descr[], void * args)
 	gpuMultKernel
 		<<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 

+ 2 - 0
julia/examples/old_examples/mandelbrot/gpu_mandelbrot.cu

@@ -106,6 +106,8 @@ extern "C" void gpu_mandelbrot(void *descr[], void *args)
   nblocks = (nxP * nyP + THREADS_PER_BLOCK - 1)/THREADS_PER_BLOCK;
 
   gpuMandelbrotKernel <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream() >>> (nxP, nyP, ldP, d_subP, *params);
+  cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
julia/examples/old_examples/mandelbrot/gpu_mandelbrot_between.cu

@@ -123,6 +123,8 @@ extern "C" void CUDA_mandelbrot(void** buffers_uwrYFDVe, void* cl_arg_uwrYFDVe)
              ptr_qoUGBRtY, local_height, conv_limit, ptr_A5zD9sJZ, 
              ld_A5zD9sJZ);
     ;
+    cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
     cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 

+ 2 - 0
julia/examples/old_examples/mult/gpu_mult.cu

@@ -78,6 +78,8 @@ extern "C" void gpu_mult(void * descr[], void * args)
 	gpuMultKernel
 		<<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
 		>>> (nxC, nyC, nyA, ldA, ldB, ldC, d_subA, d_subB, d_subC);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 

+ 4 - 0
julia/examples/old_examples/nbody/gpu_nbody.cu

@@ -94,6 +94,8 @@ extern "C" void gpu_nbody(void * descr[], void * args)
   gpuNbodyKernel
     <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
     >>> (d_P,  d_subA, d_M, nxP, nxA, nxM, ldP, ldA, *params);
+  cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
 
@@ -156,6 +158,8 @@ extern "C" void gpu_nbody2(void * descr[], void *args)
   gpuNbody2Kernel
     <<< nblocks, THREADS_PER_BLOCK, 0, starpu_cuda_get_local_stream()
     >>> (d_subP, d_subV, d_subA, nxP, nxV, nxA, ldP, ldV, ldA, *params);
+  cudaError_t status = cudaGetLastError();
+  if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 
   cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 2 - 0
julia/examples/old_examples/nbody/gpu_nbody_between.cu

@@ -161,6 +161,8 @@ extern "C" void CUDA_nbody_updt(void** buffers_gj6UYWT4, void* cl_arg_gj6UYWT4)
              ld_jJ5f8wMA, ptr_piPvdbTs, ld_piPvdbTs, ptr_JBaPgPiT, 
              ptr_0STm2S4k, ld_0STm2S4k);
     ;
+    cudaError_t status = cudaGetLastError();
+    if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
     cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }
 

+ 4 - 0
julia/src/compiler/expressions.jl

@@ -335,6 +335,10 @@ function print(io :: IO, expr :: StarpuExprCudaCall ; indent = 0,restrict=false)
 
     print(io, ");")
     print_newline(io, indent)
+    print(io, "cudaError_t status = cudaGetLastError();")
+    print_newline(io, indent)
+    print(io, "if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);")
+    print_newline(io, indent)
 
 end
 

+ 2 - 0
mpi/tests/ring_kernel.cu

@@ -27,5 +27,7 @@ extern "C" void increment_cuda(void *descr[], void *_args)
 	int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	cuda_incrementer<<<1,1, 0, starpu_cuda_get_local_stream()>>>(tokenptr);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 	cudaStreamSynchronize(starpu_cuda_get_local_stream());
 }

+ 1 - 0
src/datawizard/memalloc.c

@@ -873,6 +873,7 @@ restart:
 			continue;
 		if (is_prefetch == 1 && !mc->wontuse)
 			/* Do not evict something that we might reuse, just for a prefetch */
+			/* TODO ! */
 			/* FIXME: but perhaps we won't have any task using it in
                          * the close future, we should perhaps rather check
                          * mc->replicate->refcnt? */

+ 12 - 0
starpufft/src/cudax_kernels.cu

@@ -30,12 +30,16 @@
 	{			   \
 		dim3 dimGrid(n); \
 		func <<<dimGrid, 1, 0, starpu_cuda_get_local_stream()>>> args; \
+		cudaError_t status = cudaGetLastError(); \
+		if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status); \
 	} 					\
 	else 					\
 	{				     \
 		dim3 dimGrid(n / threads_per_block); \
 		dim3 dimBlock(threads_per_block); \
 		func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
+		cudaError_t status = cudaGetLastError(); \
+		if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status); \
 	} \
 	cudaStreamSynchronize(starpu_cuda_get_local_stream()); \
 
@@ -85,12 +89,16 @@ extern "C" void STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComple
 		{			    \
 			dim3 dimGrid(n, m); \
 			func <<<dimGrid, 1, 0, starpu_cuda_get_local_stream()>>> args; \
+			cudaError_t status = cudaGetLastError(); \
+			if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status); \
 		} \
 		else \
 		{					      \
 			dim3 dimGrid(1, m / threads_per_dim); \
 			dim3 dimBlock(n, threads_per_dim); \
 			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
+			cudaError_t status = cudaGetLastError(); \
+			if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status); \
 		} \
 	} \
 	else \
@@ -100,12 +108,16 @@ extern "C" void STARPUFFT(cuda_twiddle_1d_host)(_cuComplex *out, const _cuComple
 			dim3 dimGrid(n / threads_per_dim, 1); \
 			dim3 dimBlock(threads_per_dim, m); \
 			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
+			cudaError_t status = cudaGetLastError(); \
+			if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status); \
 		} \
 		else \
 		{							\
 			dim3 dimGrid(n / threads_per_dim, m / threads_per_dim); \
 			dim3 dimBlock(threads_per_dim, threads_per_dim); \
 			func <<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>> args; \
+			cudaError_t status = cudaGetLastError(); \
+			if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status); \
 		} \
 	} \
 	cudaStreamSynchronize(starpu_cuda_get_local_stream()); \

+ 2 - 0
starpurm/dev/cuda_vector_scale/vs_cuda_kernel.cu

@@ -59,4 +59,6 @@ extern "C" void vector_scale_cuda_func(void *cl_buffers[], void *cl_arg)
 	unsigned nb_threads_per_block = 64;
 	unsigned nb_blocks = (n + nb_threads_per_block-1) / nb_threads_per_block;
 	vector_scale_cuda_kernel<<<nb_blocks,nb_threads_per_block,0,starpu_cuda_get_local_stream()>>>(vector, n, scalar);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
tests/datawizard/acquire_release_cuda.cu

@@ -26,4 +26,6 @@ extern "C" void increment_cuda(void *descr[], void *cl_arg)
 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	_increment_cuda_codelet<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
tests/datawizard/cuda_codelet_unsigned_inc.cu

@@ -27,4 +27,6 @@ extern "C" void cuda_codelet_unsigned_inc(void *descr[], void *cl_arg)
 	unsigned *val = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	_cuda_unsigned_inc<<<1,1, 0, starpu_cuda_get_local_stream()>>>(val);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
tests/datawizard/interfaces/bcsr/bcsr_cuda.cu

@@ -60,6 +60,8 @@ extern "C" void test_bcsr_cuda_func(void *buffers[], void *args)
 
         bcsr_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>
 		(val, nnz, ret, factor);
+	error = cudaGetLastError();
+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
 
 	error = cudaMemcpyAsync(&bcsr_config.copy_failed,
 			   ret,

+ 2 - 0
tests/datawizard/interfaces/block/block_cuda.cu

@@ -70,6 +70,8 @@ extern "C" void test_block_cuda_func(void *buffers[], void *args)
 
         block_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>
 		(block, nx, ny, nz, ldy, ldz, factor, ret);
+	error = cudaGetLastError();
+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
 	error = cudaMemcpyAsync(&block_config.copy_failed, ret, sizeof(int), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);

+ 2 - 0
tests/datawizard/interfaces/coo/coo_cuda.cu

@@ -57,6 +57,8 @@ extern "C" void test_coo_cuda_func(void *buffers[], void *args)
 
         coo_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>
 		(val, nvalues, ret, factor);
+	error = cudaGetLastError();
+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
 
 	error = cudaMemcpyAsync(&coo_config.copy_failed,
 			   ret,

+ 2 - 0
tests/datawizard/interfaces/csr/csr_cuda.cu

@@ -56,6 +56,8 @@ extern "C" void test_csr_cuda_func(void *buffers[], void *args)
 		STARPU_CUDA_REPORT_ERROR(error);
 
         csr_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>> (val, nnz, ret, factor);
+	error = cudaGetLastError();
+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
 
 	error = cudaMemcpyAsync(&csr_config.copy_failed,
 			   ret,

+ 2 - 0
tests/datawizard/interfaces/matrix/matrix_cuda.cu

@@ -59,6 +59,8 @@ extern "C" void test_matrix_cuda_func(void *buffers[], void *args)
 		STARPU_CUDA_REPORT_ERROR(error);
 
         matrix_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(val, n, ret, factor);
+	error = cudaGetLastError();
+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
 
 	error = cudaMemcpyAsync(&matrix_config.copy_failed,
 			   ret,

+ 2 - 0
tests/datawizard/interfaces/multiformat/multiformat_conversion_codelets_cuda.cu

@@ -45,4 +45,6 @@ extern "C" void cpu_to_cuda_cuda_func(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         cpu_to_cuda_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(src, dst, n);
+        cudaError_t status = cudaGetLastError();
+        if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
tests/datawizard/interfaces/multiformat/multiformat_cuda.cu

@@ -65,6 +65,8 @@ extern "C" void test_multiformat_cuda_func(void *buffers[], void *args)
 		STARPU_CUDA_REPORT_ERROR(error);
 
         multiformat_cuda<<<nblocks,threads_per_block,2,starpu_cuda_get_local_stream()>>>(soa, n, ret, factor);
+	error = cudaGetLastError();
+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
 
 	error = cudaMemcpyAsync(&multiformat_config.copy_failed,
 			   ret,

+ 2 - 0
tests/datawizard/interfaces/tensor/tensor_cuda.cu

@@ -75,6 +75,8 @@ extern "C" void test_tensor_cuda_func(void *buffers[], void *args)
 
         tensor_cuda<<<1,1, 0, starpu_cuda_get_local_stream()>>>
 		(tensor, nx, ny, nz, nt, ldy, ldz, ldt, factor, ret);
+	error = cudaGetLastError();
+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
 	error = cudaMemcpyAsync(&tensor_config.copy_failed, ret, sizeof(int), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	if (error != cudaSuccess)
 		STARPU_CUDA_REPORT_ERROR(error);

+ 2 - 0
tests/datawizard/interfaces/variable/variable_cuda.cu

@@ -55,6 +55,8 @@ extern "C" void test_variable_cuda_func(void *buffers[], void *args)
 	unsigned nblocks = 1;
 
         variable_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(val, ret, factor);
+	error = cudaGetLastError();
+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
 	error = cudaMemcpyAsync(&variable_config.copy_failed,
 			   ret,
 			   sizeof(int),

+ 2 - 0
tests/datawizard/interfaces/vector/vector_cuda.cu

@@ -55,6 +55,8 @@ extern "C" void test_vector_cuda_func(void *buffers[], void *args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         framework_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(val, n, ret, factor);
+	error = cudaGetLastError();
+	if (error != cudaSuccess) STARPU_CUDA_REPORT_ERROR(error);
 	error = cudaMemcpyAsync(&vector_config.copy_failed, ret, sizeof(int), cudaMemcpyDeviceToHost, starpu_cuda_get_local_stream());
 	if (error != cudaSuccess)
 	{

+ 2 - 0
tests/datawizard/scal_cuda.cu

@@ -32,4 +32,6 @@ extern "C" void scal_func_cuda(void *buffers[], void *_args)
 	unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
 
         vector_mult_cuda<<<nblocks,threads_per_block,0,starpu_cuda_get_local_stream()>>>(val, n);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
tests/datawizard/scratch_cuda.cu

@@ -46,4 +46,6 @@ extern "C" void cuda_f(void *descr[], void *_args)
 	unsigned nthread_per_block = STARPU_MIN(MAXTHREADSPERBLOCK, (nx / nblocks));
 
 	increment_vector<<<nblocks, nthread_per_block, 0, starpu_cuda_get_local_stream()>>>(v, tmp, nx);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 4 - 0
tests/datawizard/sync_and_notify_data_kernels.cu

@@ -30,6 +30,8 @@ extern "C" void cuda_codelet_incA(void *descr[], void *_args)
 	unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	_cuda_incA<<<1,1, 0, starpu_cuda_get_local_stream()>>>(v);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }
 
 /*
@@ -46,4 +48,6 @@ extern "C" void cuda_codelet_incC(void *descr[], void *_args)
 	unsigned *v = (unsigned *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	_cuda_incC<<<1,1, 0, starpu_cuda_get_local_stream()>>>(v);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
tests/main/increment.cu

@@ -27,4 +27,6 @@ extern "C" void cuda_host_increment(void *descr[], void *_args)
 	unsigned *var = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
 
 	cuda_increment<<<1,1, 0, starpu_cuda_get_local_stream()>>>(var);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }

+ 2 - 0
tests/overlap/long_kernel.cu

@@ -29,4 +29,6 @@ extern "C" void long_kernel_cuda(unsigned long niters)
 	dim3 dimBlock(1,1);
 	dim3 dimGrid(1,1);
 	long_kernel<<<dimGrid, dimBlock, 0, starpu_cuda_get_local_stream()>>>(niters);
+	cudaError_t status = cudaGetLastError();
+	if (status != cudaSuccess) STARPU_CUDA_REPORT_ERROR(status);
 }