浏览代码

CUDA and OpenCL drivers: when STARPU_CUDA_PIPELINE and STARPU_OPENCL_PIPELINE are set to 0, force synchronous execution of all kernels

Nathalie Furmento 10 年之前
父节点
当前提交
1ae61a95f9
共有 4 个文件被更改,包括 48 次插入24 次删除
  1. 2 1
      ChangeLog
  2. 4 2
      doc/doxygen/chapters/40environment_variables.doxy
  3. 20 9
      src/drivers/cuda/driver_cuda.c
  4. 22 12
      src/drivers/opencl/driver_opencl.c

+ 2 - 1
ChangeLog

@@ -47,7 +47,8 @@ New features:
     CUDA and OpenCL kernel execution.
   * Add STARPU_CUDA_PIPELINE and STARPU_OPENCL_PIPELINE to specify how
     many asynchronous tasks are submitted in advance on CUDA and
-    OpenCL devices.
+    OpenCL devices. Setting the value to 0 forces a synchronous
+    execution of all tasks.
   * Add CUDA concurrent kernel execution support through
     the STARPU_NWORKER_PER_CUDA environment variable.
   * Add CUDA and OpenCL kernel submission pipelining, to overlap costs and allow

+ 4 - 2
doc/doxygen/chapters/40environment_variables.doxy

@@ -58,7 +58,8 @@ which will be concurrently running on the devices. The default value is 1.
 Specify how many asynchronous tasks are submitted in advance on CUDA
 devices. This for instance permits to overlap task management with the execution
 of previous tasks, but it also allows concurrent execution on Fermi cards, which
-otherwise bring spurious synchronizations. The default is 2.
+otherwise bring spurious synchronizations. The default is 2. Setting the value to 0 forces a synchronous
+execution of all tasks.
 </dd>
 
 <dt>STARPU_NOPENCL</dt>
@@ -75,7 +76,8 @@ OpenCL equivalent of the environment variable \ref STARPU_NCUDA.
 Specify how many asynchronous tasks are submitted in advance on OpenCL
 devices. This for instance permits to overlap task management with the execution
 of previous tasks, but it also allows concurrent execution on Fermi cards, which
-otherwise bring spurious synchronizations. The default is 2.
+otherwise bring spurious synchronizations. The default is 2. Setting the value to 0 forces a synchronous
+execution of all tasks.
 </dd>
 
 <dt>STARPU_NMICDEVS</dt>

+ 20 - 9
src/drivers/cuda/driver_cuda.c

@@ -472,16 +472,27 @@ static void execute_job_on_cuda(struct starpu_task *task, struct _starpu_worker
 #ifndef STARPU_SIMGRID
 	if (task->cl->cuda_flags[j->nimpl] & STARPU_CUDA_ASYNC)
 	{
-		/* Record event to synchronize with task termination later */
-		cudaEventRecord(task_events[workerid][(worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE], starpu_cuda_get_local_stream());
+		if (worker->pipeline_length == 0)
+		{
+			cudaStreamSynchronize(starpu_cuda_get_local_stream());
+#if defined(STARPU_DEBUG) && !defined(STARPU_SIMGRID)
+			STARPU_ASSERT_MSG(cudaStreamQuery(starpu_cuda_get_local_stream()) == cudaSuccess, "CUDA codelets have to wait for termination of their kernels on the starpu_cuda_get_local_stream() stream");
+#endif
+			finish_job_on_cuda(j, worker);
+		}
+		else
+		{
+			/* Record event to synchronize with task termination later */
+			cudaEventRecord(task_events[workerid][(worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE], starpu_cuda_get_local_stream());
 #ifdef STARPU_USE_FXT
-		int k;
-		for (k = 0; k < (int) worker->set->nworkers; k++)
-			if (worker->set->workers[k].ntasks == worker->set->workers[k].pipeline_length)
-				break;
-		if (k == (int) worker->set->nworkers)
-			/* Everybody busy */
-			_STARPU_TRACE_START_EXECUTING()
+			int k;
+			for (k = 0; k < (int) worker->set->nworkers; k++)
+				if (worker->set->workers[k].ntasks == worker->set->workers[k].pipeline_length)
+					break;
+			if (k == (int) worker->set->nworkers)
+				/* Everybody busy */
+				_STARPU_TRACE_START_EXECUTING();
+		}
 #endif
 	}
 	else

+ 22 - 12
src/drivers/opencl/driver_opencl.c

@@ -903,18 +903,28 @@ static void _starpu_opencl_execute_job(struct starpu_task *task, struct _starpu_
 		int err;
 		cl_command_queue queue;
 		starpu_opencl_get_queue(worker->devid, &queue);
-		/* the function clEnqueueMarker is deprecated from
-		 * OpenCL version 1.2. We would like to use the new
-		 * function clEnqueueMarkerWithWaitList. We could do
-		 * it by checking its availability through our own
-		 * configure macro HAVE_CLENQUEUEMARKERWITHWAITLIST
-		 * and the OpenCL macro CL_VERSION_1_2. However these
-		 * 2 macros detect the function availability in the
-		 * ICD and not in the device implementation.
-		 */
-		err = clEnqueueMarker(queue, &task_events[worker->devid][(worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE]);
-		if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
-		_STARPU_TRACE_START_EXECUTING();
+
+		if (worker->pipeline_length == 0)
+		{
+			starpu_opencl_get_queue(worker->devid, &queue);
+			clFinish(queue);
+			_starpu_opencl_stop_job(j, worker);
+		}
+		else
+		{
+			/* the function clEnqueueMarker is deprecated from
+			 * OpenCL version 1.2. We would like to use the new
+			 * function clEnqueueMarkerWithWaitList. We could do
+			 * it by checking its availability through our own
+			 * configure macro HAVE_CLENQUEUEMARKERWITHWAITLIST
+			 * and the OpenCL macro CL_VERSION_1_2. However these
+			 * 2 macros detect the function availability in the
+			 * ICD and not in the device implementation.
+			 */
+			err = clEnqueueMarker(queue, &task_events[worker->devid][(worker->first_task + worker->ntasks - 1)%STARPU_MAX_PIPELINE]);
+			if (STARPU_UNLIKELY(err != CL_SUCCESS)) STARPU_OPENCL_REPORT_ERROR(err);
+			_STARPU_TRACE_START_EXECUTING();
+		}
 	}
 	else
 #else