10 years ago · 1ae61a95f9
--- a/ChangeLog
+++ b/ChangeLog
@@ -47,7 +47,8 @@ New features:
 
				     CUDA and OpenCL kernel execution.
			
 
				   * Add STARPU_CUDA_PIPELINE and STARPU_OPENCL_PIPELINE to specify how
			
 
				     many asynchronous tasks are submitted in advance on CUDA and
			
 
				-    OpenCL devices.
			
 
				+    OpenCL devices. Setting the value to 0 forces a synchronous
			
 
				+    execution of all tasks.
			
 
				   * Add CUDA concurrent kernel execution support through
			
 
				     the STARPU_NWORKER_PER_CUDA environment variable.
			
 
				   * Add CUDA and OpenCL kernel submission pipelining, to overlap costs and allow
			
--- a/doc/doxygen/chapters/40environment_variables.doxy
+++ b/doc/doxygen/chapters/40environment_variables.doxy
@@ -58,7 +58,8 @@ which will be concurrently running on the devices. The default value is 1.
 
				 Specify how many asynchronous tasks are submitted in advance on CUDA
			
 
				 devices. This for instance permits to overlap task management with the execution
			
 
				 of previous tasks, but it also allows concurrent execution on Fermi cards, which
			
 
				-otherwise bring spurious synchronizations. The default is 2.
			
 
				+otherwise bring spurious synchronizations. The default is 2. Setting the value to 0 forces a synchronous
			
 
				+execution of all tasks.
			
 
				 </dd>
			
 
				 
			
 
				 <dt>STARPU_NOPENCL</dt>
			
@@ -75,7 +76,8 @@ OpenCL equivalent of the environment variable \ref STARPU_NCUDA.
 
				 Specify how many asynchronous tasks are submitted in advance on OpenCL
			
 
				 devices. This for instance permits to overlap task management with the execution
			
 
				 of previous tasks, but it also allows concurrent execution on Fermi cards, which
			
 
				-otherwise bring spurious synchronizations. The default is 2.
			
 
				+otherwise bring spurious synchronizations. The default is 2. Setting the value to 0 forces a synchronous
			
 
				+execution of all tasks.
			
 
				 </dd>
			
 
				 
			
 
				 <dt>STARPU_NMICDEVS</dt>
			
--- a/src/drivers/cuda/driver_cuda.c
+++ b/src/drivers/cuda/driver_cuda.c
@@ -472,16 +472,27 @@ static void execute_job_on_cuda(struct starpu_task *task, struct _starpu_worker
 
				 #ifndef STARPU_SIMGRID
			
 
				 	if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
			
 
				 	{
			
 
				-		/* Record event to synchronize with task termination later */
			
 
				-		cudaEventRecord(task_events[workerid][(worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE], starpu_cuda_get_local_stream());
			
 
				+		if (worker->pipeline_length == 0)
			
 
				+		{
			
 
				+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
			
 
				+#if defined(STARPU_DEBUG) && !defined(STARPU_SIMGRID)
			
 
				+			STARPU_ASSERT_MSG(cudaStreamQuery(starpu_cuda_get_local_stream()) == cudaSuccess, "CUDA codelets have to wait for termination of their kernels on the starpu_cuda_get_local_stream() stream");
			
 
				+#endif
			
 
				+			finish_job_on_cuda(j, worker);
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				+			/* Record event to synchronize with task termination later */
			
 
				+			cudaEventRecord(task_events[workerid][(worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE], starpu_cuda_get_local_stream());
			
 
				 #ifdef STARPU_USE_FXT
			
 
				-		int k;
			
 
				-		for (k = 0; k < (int) worker->set->nworkers; k++)
			
 
				-			if (worker->set->workers[k].ntasks == worker->set->workers[k].pipeline_length)
			
 
				-				break;
			
 
				-		if (k == (int) worker->set->nworkers)
			
 
				-			/* Everybody busy */
			
 
				-			_STARPU_TRACE_START_EXECUTING()
			
 
				+			int k;
			
 
				+			for (k = 0; k < (int) worker->set->nworkers; k++)
			
 
				+				if (worker->set->workers[k].ntasks == worker->set->workers[k].pipeline_length)
			
 
				+					break;
			
 
				+			if (k == (int) worker->set->nworkers)
			
 
				+				/* Everybody busy */
			
 
				+				_STARPU_TRACE_START_EXECUTING();
			
 
				+		}
			
 
				 #endif
			
 
				 	}
			
 
				 	else
			
--- a/src/drivers/opencl/driver_opencl.c
+++ b/src/drivers/opencl/driver_opencl.c
@@ -903,18 +903,28 @@ static void _starpu_opencl_execute_job(struct starpu_task *task, struct _starpu_
 
				 		int err;
			
 
				 		cl_command_queue queue;
			
 
				 		starpu_opencl_get_queue(worker->devid, &queue);
			
 
				-		/* the function clEnqueueMarker is deprecated from
			
 
				-		 * OpenCL version 1.2. We would like to use the new
			
 
				-		 * function clEnqueueMarkerWithWaitList. We could do
			
 
				-		 * it by checking its availability through our own
			
 
				-		 * configure macro HAVE_CLENQUEUEMARKERWITHWAITLIST
			
 
				-		 * and the OpenCL macro CL_VERSION_1_2. However these
			
 
				-		 * 2 macros detect the function availability in the
			
 
				-		 * ICD and not in the device implementation.
			
 
				-		 */
			
 
				-		err = clEnqueueMarker(queue, &task_events[worker->devid][(worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE]);
			
 
				-		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				-		_STARPU_TRACE_START_EXECUTING();
			
 
				+
			
 
				+		if (worker->pipeline_length == 0)
			
 
				+		{
			
 
				+			starpu_opencl_get_queue(worker->devid, &queue);
			
 
				+			clFinish(queue);
			
 
				+			_starpu_opencl_stop_job(j, worker);
			
 
				+		}
			
 
				+		else
			
 
				+		{
			
 
				+			/* the function clEnqueueMarker is deprecated from
			
 
				+			 * OpenCL version 1.2. We would like to use the new
			
 
				+			 * function clEnqueueMarkerWithWaitList. We could do
			
 
				+			 * it by checking its availability through our own
			
 
				+			 * configure macro HAVE_CLENQUEUEMARKERWITHWAITLIST
			
 
				+			 * and the OpenCL macro CL_VERSION_1_2. However these
			
 
				+			 * 2 macros detect the function availability in the
			
 
				+			 * ICD and not in the device implementation.
			
 
				+			 */
			
 
				+			err = clEnqueueMarker(queue, &task_events[worker->devid][(worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE]);
			
 
				+			if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
			
 
				+			_STARPU_TRACE_START_EXECUTING();
			
 
				+		}
			
 
				 	}
			
 
				 	else
			
 
				 #else