7 years ago · 20fd29cfa3
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -62,6 +62,7 @@ static nvmlDevice_t nvmlDev[STARPU_MAXCUDADEVS];
 
				 int _starpu_cuda_bus_ids[STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES][STARPU_MAXCUDADEVS+STARPU_MAXNUMANODES];
			
 
				 #ifdef STARPU_USE_CUDA
			
 
				 static cudaStream_t streams[STARPU_NMAXWORKERS];
			
 
				+static char used_stream[STARPU_NMAXWORKERS]; /* Per-worker flag, indexed by worker id: set to 1 when starpu_cuda_get_local_stream() is called on that worker, or once the "local stream not used" warning has been displayed for it, so the warning is printed at most once per worker */
			
 
				 static cudaStream_t out_transfer_streams[STARPU_MAXCUDADEVS];
			
 
				 static cudaStream_t in_transfer_streams[STARPU_MAXCUDADEVS];
			
 
				 /* Note: streams are not thread-safe, so we define them for each CUDA worker
			
@@ -224,6 +225,7 @@ cudaStream_t starpu_cuda_get_local_stream(void)
 
				 {
			
 
				 	int worker = starpu_worker_get_id_check();
			
 
				 
			
 
				+	used_stream[worker] = 1;
			
 
				 	return streams[worker];
			
 
				 }
			
 
				 
			
@@ -613,6 +615,14 @@ static void execute_job_on_cuda(struct starpu_task *task, struct _starpu_worker
 
				 		}
			
 
				 	}
			
 
				 
			
 
				+#ifndef STARPU_SIMGRID
			
 
				+	if (!used_stream[workerid])
			
 
				+	{
			
 
				+		used_stream[workerid] = 1;
			
 
				+		_STARPU_DISP("Warning: starpu_cuda_get_local_stream() was not used to submit kernel to CUDA on worker %d. CUDA will thus introduce a lot of useless synchronizations, which will prevent proper overlapping of data transfers and kernel execution. See the CUDA-specific part of the 'Check List When Performance Are Not There' of the StarPU handbook\n", workerid);
			
 
				+	}
			
 
				+#endif
			
 
				+
			
 
				 	if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
			
 
				 	{
			
 
				 		if (worker->pipeline_length == 0)